Example #1
end genesis

__kernel
void kernel_mc02_orig(unsigned int outer_tc, unsigned int inner_tc, __global __volatile float *arr){

	for (unsigned int it00 = get_local_id(0); it00 < outer_tc; it00 += get_local_size(0)) {
		for (unsigned int it01 = get_local_id(1); it01 < inner_tc; it01 += get_local_size(1)) {
            ${epoch[5]}
        }
	}
 
}
Example #2
size_t _CL_OVERLOADABLE
get_global_id(unsigned int dimindx)
{
  switch(dimindx)
    {
        /* TODO: add get_global_offset(X) to these! */
    case 0: return get_local_size(0) * get_group_id(0) + get_local_id(0);
    case 1: return get_local_size(1) * get_group_id(1) + get_local_id(1);
    case 2: return get_local_size(2) * get_group_id(2) + get_local_id(2);
    default: return 0;
    }
}
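For reference, this emulation mirrors the identity the built-in is defined by, minus the offset term the TODO mentions. A minimal, hypothetical self-check kernel (not part of the original source) that flags any disagreement between the built-in and that identity:

/* Hypothetical self-check sketch: writes a flag if get_global_id(0) ever
   differs from the group/local identity including the global offset. */
__kernel void check_global_id(__global int *mismatch)
{
  size_t expected = get_local_size(0) * get_group_id(0)
                  + get_local_id(0) + get_global_offset(0);
  if (get_global_id(0) != expected)
    *mismatch = 1;
}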
Example #3
/// \fn _copyKernel
/// \brief generate a copy kernel program
compute::program _copyKernel(const compute::context& context)
{
    const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
        __kernel void copy_kernel(__global const float *src, __global float *dst)
        {
            uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
            uint y = get_group_id(1) * TILE_DIM + get_local_id(1);
            
            uint width = get_num_groups(0) * TILE_DIM;
            
            for(uint i = 0 ; i < TILE_DIM ; i+= BLOCK_ROWS)
            {
                dst[(y+i)*width +x] = src[(y+i)*width + x];
            }
        }
__kernel void memset_uint4(__global int *mem, const int size, __private int val) { \n\
int tid = get_local_id(0); \n\
int bx = (get_group_id(1)) * (get_num_groups(0)) + get_group_id(0); \n\
int i = tid + (bx) * (get_local_size(0)); \n\
//debug \n\
//if (i == 0) { printf(\"memset size = %i value = %i buffer %i \\n\",size,val,mem[0]); } \n\
if (i < size ) { mem[i]=val; } \n\
}" };
int get_global_id(unsigned int dim)
{
	if (dim < _numDimensions)
	{
		return _globalIds[dim] * _localSizes[dim] + get_local_id(dim);
	}
	return 0;
}
Example #6
    std::vector<unsigned> get_nodes(unsigned elem_index) const
    {
        std::vector<unsigned> node_conn(m_bulkData.num_nodes(m_localIdToElement[elem_index]));
        const stk::mesh::Entity* nodes = m_bulkData.begin_nodes(m_localIdToElement[elem_index]);
        for(size_t i=0;i<node_conn.size();++i)
            node_conn[i] = get_local_id(nodes[i]);

        return node_conn;
    }
Example #7
void ComputeGradinetsRTLR_SetGradients(local float* tmpGradients, global float* gradients, global float* gradientSums)
{
    Reduce_Sum(tmpGradients);

    if (get_local_id(0) == 0)
    {
        int ijValueIndex = get_group_id(0);
        if (gradients != null) gradients[ijValueIndex] = tmpGradients[0];
        if (gradientSums != null) gradientSums[ijValueIndex] += tmpGradients[0];
    }
}
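The Reduce_Sum helper called above is not shown in this listing; a hypothetical sketch of such a work-group reduction over tmpGradients (assuming a power-of-two work-group size) could look like this:

// Hypothetical sketch of a Reduce_Sum helper (the real implementation is not shown here):
// in-place tree reduction of one value per work-item, leaving the total in values[0].
void Reduce_Sum(local float* values)
{
    barrier(CLK_LOCAL_MEM_FENCE);
    for (int i = get_local_size(0) / 2; i > 0; i >>= 1)
    {
        if (get_local_id(0) < i)
            values[get_local_id(0)] += values[get_local_id(0) + i];
        barrier(CLK_LOCAL_MEM_FENCE);
    }
}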
__kernel void kernel_scan(__global float* input, __global float* output) {
    int global_idx = get_global_id(0);
    int local_idx = get_local_id(0);
    int block_size = get_local_size(0);
    int group_id = get_group_id(0);

    output[global_idx] = input[global_idx];
    mem_fence(CLK_GLOBAL_MEM_FENCE);

    for(int i = 1; i < block_size; i <<= 1) {
        if(global_idx >= i)
            output[global_idx] += output[global_idx - i];
        mem_fence(CLK_GLOBAL_MEM_FENCE);
    }
}
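Note that mem_fence only orders memory operations within the issuing work-item; it does not synchronize work-items, so the in-place global scan above is racy. A hedged alternative sketch, limited to one work-group, keeps the Hillis-Steele scan in local memory behind barriers (names are illustrative, not from the original source):

__kernel void kernel_scan_local(__global const float* input, __global float* output,
                                __local float* temp) {
    int global_idx = get_global_id(0);
    int local_idx = get_local_id(0);
    int block_size = get_local_size(0);

    temp[local_idx] = input[global_idx];
    barrier(CLK_LOCAL_MEM_FENCE);

    for(int i = 1; i < block_size; i <<= 1) {
        float addend = (local_idx >= i) ? temp[local_idx - i] : 0.0f;
        barrier(CLK_LOCAL_MEM_FENCE);      // everyone reads before anyone writes
        temp[local_idx] += addend;
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    output[global_idx] = temp[local_idx];   // inclusive prefix sum within the work-group
}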
Example #9
__kernel void kernel_reduce(__global float* input, __global float* output) {
    int global_idx = get_global_id(0);
    int local_idx = get_local_id(0);
    int block_size = get_local_size(0);
    int group_id = get_group_id(0);

    for(int i = block_size/2; i > 0; i >>= 1) {
        if(local_idx < i)
            input[global_idx] += input[global_idx + i];

        barrier(CLK_GLOBAL_MEM_FENCE);
    }

    if(local_idx == 0)
        output[group_id] = input[global_idx];
}
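The reduction above works in place on global memory and destroys its input; a common variant, sketched here under the same power-of-two work-group-size assumption, stages the partial sums in local memory instead:

__kernel void kernel_reduce_local(__global const float* input, __global float* output,
                                  __local float* scratch) {
    int global_idx = get_global_id(0);
    int local_idx = get_local_id(0);

    scratch[local_idx] = input[global_idx];
    barrier(CLK_LOCAL_MEM_FENCE);

    for(int i = get_local_size(0)/2; i > 0; i >>= 1) {
        if(local_idx < i)
            scratch[local_idx] += scratch[local_idx + i];
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if(local_idx == 0)
        output[get_group_id(0)] = scratch[0];   // one partial sum per work-group
}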
__kernel void custom_daxpy_kernel(
									__global double * x,
									const double c
								 )
{
	// variables
	int localIndex;
	
	// function body
	// get the index we're working on
	localIndex = get_local_id( 0 );
	// output[ index ] = input[ index ] + c
	x[ localIndex ] = x[ localIndex ] + c;
	
	// clean up
	return;
}
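As written, the kernel indexes by get_local_id(0), so it only covers one work-group's worth of elements; work-items in different groups would write the same slots. A hedged variant using the global index, with an assumed length argument n that the original does not have:

__kernel void custom_daxpy_kernel_global(
									__global double * x,
									const double c,
									const unsigned int n   // assumed extra argument: vector length
								 )
{
	size_t globalIndex = get_global_id( 0 );
	if ( globalIndex < n )
	{
		x[ globalIndex ] = x[ globalIndex ] + c;
	}
}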
Example #11
File: metric_area.c  Project: NHALX/llio
__kernel void metric_area(
    ulong_t linext_offset,
    lattice_info info,
    __global ideal_t *linext,
    DB,
    llf_criteria cfg,
    __local ideal_t *pasv_scratch,
    __local result_t* scratch,
    __global result_t* result)
{
    result_t metric = { 0, 0 };
    count_t nth_extension = linext_offset + get_global_id(0);

    linext       += info.linext_width * get_global_id(0);
    pasv_scratch += PASV_SCRATCH_LEN(info.linext_width) * get_local_id(0);

    ZERO_INIT(pasv_scratch, PASV_SCRATCH_LEN(info.linext_width));

    metric = RateBuildpath(linext, db_items, &cfg, pasv_scratch, &info, nth_extension);
    Reduce(metric, scratch, result);
}
	__kernel void TRIPLE_DGEMM_UPDATE_192_12_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, uint lda, int npages, int na)
{
	// Ain is the non inverse matrix; the size of Ain is lda * na
	// offAin is the offset of Ain
	// d_dinvA is the inversed matrix. the size of d_invA is NB * (na-1)/NB + 1
	// blk is subblock size, which is 12 here.
	// lda in leading dimension. Column major here
	// npages = (na-1)/12*2 + 1; for 96 this is 4 for 192 this is 8

	//Work group size is [12]
	//global work size is [96*number of blocks]
	//each work item in each work group is responsible for every element in that row
	//each work group is responsible for one gemm;


	////////////// A12*invA22
	const uint gidx = get_group_id(0);
	const uint idx = get_local_id(0);

	const uint page = gidx % npages;
	const uint page_block = page / 8;//8 pages per page block
	const uint page_index_in_block = page % 8;


	__global double *B, *C;
	__local double lA[12][12];
	__local double lB[12][12];
	double privateC[12] = { (double)0 };

	//decide A12 location for each page
	Ain = Ain + offAin;
	Ain += (page*blk * 2 + blk) * lda + page * 2 * blk;

	//decide invA22 (B) location for each page
	B = d_dinvA + page_block*NB*NB + (page_index_in_block*blk * 2 + blk) * NB + page_index_in_block * 2 * blk + blk;

	//decide invA12 location for each page
	C = d_dinvA + page_block*NB*NB + (page_index_in_block*blk * 2 + blk) * NB + page_index_in_block * 2 * blk;

	//read A and B into LDS no transpose operated here
	lA[idx][0] = Ain[idx];
	lA[idx][1] = Ain[idx + lda];
	lA[idx][2] = Ain[idx + lda * 2];
	lA[idx][3] = Ain[idx + lda * 3];
	lA[idx][4] = Ain[idx + lda * 4];
	lA[idx][5] = Ain[idx + lda * 5];
	lA[idx][6] = Ain[idx + lda * 6];
	lA[idx][7] = Ain[idx + lda * 7];
	lA[idx][8] = Ain[idx + lda * 8];
	lA[idx][9] = Ain[idx + lda * 9];
	lA[idx][10] = Ain[idx + lda * 10];
	lA[idx][11] = Ain[idx + lda * 11];

	lB[idx][0] = B[idx];
	lB[idx][1] = B[idx + NB];
	lB[idx][2] = B[idx + NB * 2];
	lB[idx][3] = B[idx + NB * 3];
	lB[idx][4] = B[idx + NB * 4];
	lB[idx][5] = B[idx + NB * 5];
	lB[idx][6] = B[idx + NB * 6];
	lB[idx][7] = B[idx + NB * 7];
	lB[idx][8] = B[idx + NB * 8];
	lB[idx][9] = B[idx + NB * 9];
	lB[idx][10] = B[idx + NB * 10];
	lB[idx][11] = B[idx + NB * 11];
	barrier(CLK_LOCAL_MEM_FENCE);

	//do math

	uint i = 0;

	do{
		privateC[0] = mad(lA[idx][i], lB[i][0], privateC[0]);
		privateC[1] = mad(lA[idx][i], lB[i][1], privateC[1]);
		privateC[2] = mad(lA[idx][i], lB[i][2], privateC[2]);
		privateC[3] = mad(lA[idx][i], lB[i][3], privateC[3]);
		privateC[4] = mad(lA[idx][i], lB[i][4], privateC[4]);
		privateC[5] = mad(lA[idx][i], lB[i][5], privateC[5]);
		privateC[6] = mad(lA[idx][i], lB[i][6], privateC[6]);
		privateC[7] = mad(lA[idx][i], lB[i][7], privateC[7]);
		privateC[8] = mad(lA[idx][i], lB[i][8], privateC[8]);
		privateC[9] = mad(lA[idx][i], lB[i][9], privateC[9]);
		privateC[10] = mad(lA[idx][i], lB[i][10], privateC[10]);
		privateC[11] = mad(lA[idx][i], lB[i][11], privateC[11]);
		//mem_fence(CLK_LOCAL_MEM_FENCE);
		i = i + 1;
	} while (i < 12);

	i = 0;
	do{
		C[NB*i + idx] = privateC[i];
		i = i + 1;
	} while (i < 12);

	////////////// -invA11*invA12
	barrier(CLK_GLOBAL_MEM_FENCE);
	//A is moving to invA11
	__global double *A;
	A = d_dinvA + page_block*NB*NB + ((page % 4)*blk * 2) * NB + (page % 4) * 2 * blk;
	//both B and C are pointing at invA12
	B = C;

	//read A and B into LDS no transpose operated here
	lA[idx][0] = A[idx];
	lA[idx][1] = A[idx + NB];
	lA[idx][2] = A[idx + NB * 2];
	lA[idx][3] = A[idx + NB * 3];
	lA[idx][4] = A[idx + NB * 4];
	lA[idx][5] = A[idx + NB * 5];
	lA[idx][6] = A[idx + NB * 6];
	lA[idx][7] = A[idx + NB * 7];
	lA[idx][8] = A[idx + NB * 8];
	lA[idx][9] = A[idx + NB * 9];
	lA[idx][10] = A[idx + NB * 10];
	lA[idx][11] = A[idx + NB * 11];

	lB[idx][0] = B[idx];
	lB[idx][1] = B[idx + NB];
	lB[idx][2] = B[idx + NB * 2];
	lB[idx][3] = B[idx + NB * 3];
	lB[idx][4] = B[idx + NB * 4];
	lB[idx][5] = B[idx + NB * 5];
	lB[idx][6] = B[idx + NB * 6];
	lB[idx][7] = B[idx + NB * 7];
	lB[idx][8] = B[idx + NB * 8];
	lB[idx][9] = B[idx + NB * 9];
	lB[idx][10] = B[idx + NB * 10];
	lB[idx][11] = B[idx + NB * 11];
	barrier(CLK_LOCAL_MEM_FENCE);

	//do math

	i = 0;
	privateC[0] = 0;
	privateC[1] = 0;
	privateC[2] = 0;
	privateC[3] = 0;
	privateC[4] = 0;
	privateC[5] = 0;
	privateC[6] = 0;
	privateC[7] = 0;
	privateC[8] = 0;
	privateC[9] = 0;
	privateC[10] = 0;
	privateC[11] = 0;
	do{
		privateC[0] = mad(lA[idx][i], lB[i][0], privateC[0]);
		privateC[1] = mad(lA[idx][i], lB[i][1], privateC[1]);
		privateC[2] = mad(lA[idx][i], lB[i][2], privateC[2]);
		privateC[3] = mad(lA[idx][i], lB[i][3], privateC[3]);
		privateC[4] = mad(lA[idx][i], lB[i][4], privateC[4]);
		privateC[5] = mad(lA[idx][i], lB[i][5], privateC[5]);
		privateC[6] = mad(lA[idx][i], lB[i][6], privateC[6]);
		privateC[7] = mad(lA[idx][i], lB[i][7], privateC[7]);
		privateC[8] = mad(lA[idx][i], lB[i][8], privateC[8]);
		privateC[9] = mad(lA[idx][i], lB[i][9], privateC[9]);
		privateC[10] = mad(lA[idx][i], lB[i][10], privateC[10]);
		privateC[11] = mad(lA[idx][i], lB[i][11], privateC[11]);
		//mem_fence(CLK_LOCAL_MEM_FENCE);
		i = i + 1;
	} while (i < 12);

	i = 0;
	do{
		C[NB*i + idx] = -1 * privateC[i];
		i = i + 1;
	} while (i < 12);

}
Example #13
MAYINLINE
G4double
G4NewNavigation_ComputeStep(
			G4NewNavigation *This,
			G4ThreeVector localPoint,
			G4ThreeVector localDirection,
			const G4double currentProposedStepLength,
			G4double *newSafety,
			G4NavigationHistory *history,
			G4bool *validExitNormal,
			G4ThreeVector *exitNormal,
			G4bool *exiting,
			G4bool *entering,
			GEOMETRYLOC G4VPhysicalVolume *(*pBlockedPhysical)
			 ,SHAREDMEM int * Numbers_Of_Solid,
			 SHAREDMEM int * Sum_Of_Solids,
		     SHAREDTYPE SolidInfo  * Solids,
			 SHAREDTYPE ResultInfo * Result_For_Current_Solid,
		     SHAREDTYPE FinalResult * Compacter_Result,
			 SHAREDMEM bool * noStepArray,
			 SHAREDMEM PointInformation * LocationArray,
			 GEOMETRYLOC G4SmartVoxelNode  *nullVNode,
			 G4bool cur_vol_local
#ifdef CHECK 
			,GEOMETRYLOC float * Result
#endif			
			)
{
	
  GEOMETRYLOC G4VPhysicalVolume *motherPhysical, *samplePhysical,
	*blockedExitedVol = GEOMETRYNULL;
  GEOMETRYLOC G4LogicalVolume *motherLogical;
  GEOMETRYLOC G4VSolid *motherSolid;
  G4ThreeVector sampleDirection;
  G4double ourStep=currentProposedStepLength, motherSafety, ourSafety;
  G4int sampleNo; // , localNoDaughters;
    
  
  //EDIT: Defining these here for global scope.
  GEOMETRYLOC  G4VSolid *sampleSolid;
  G4AffineTransform sampleTf;
  const G4ThreeVector samplePoint;
  G4double sampleSafety;

  int PrevSum;
	//  PrevSum stores the sum of all solids of one type so far. 
  int Count_of_Solid_type;
	// This integer is to fill up the shared mem array for solid types

// For definition of shared memory arrays see kernel trace in gpu.c 
// _--------------------------------__   

  G4bool initialNode;
  GEOMETRYLOC const G4SmartVoxelNode *curVoxelNode;
  G4int curNoVolumes, contentNo;
  G4double voxelSafety;
  G4double sampleStep;
  	
#if ( GLOBAL_MODE  == 1)
  int locationId = get_global_id(0);
#else  
  int locationId = get_local_id(0);
#endif

  motherPhysical = G4NavigationHistory_GetTopVolume( history );
  motherLogical = G4VPhysicalVolume_GetLogicalVolume(motherPhysical);
  motherSolid = G4LogicalVolume_GetSolid(motherLogical);
  
  //
  // Compute mother safety
  //

  motherSafety = G4VSolid_DistanceToOut(motherSolid, localPoint);
  ourSafety = motherSafety;                 // Working isotropic safety
 
  
  //
  // Compute daughter safeties & intersections
  //

  // Exiting normal optimisation
  //
  if ( *exiting && *validExitNormal )
  {
		if ( G4ThreeVector_dot(localDirection,*exitNormal)>=kMinExitingNormalCosine )
		{
			// Block exited daughter volume
			//
			blockedExitedVol = *pBlockedPhysical;
			ourSafety = 0;
		}
  }
  *exiting = false;
  *entering = false;

#ifdef USE_BLIST
  G4NewNavigation_EnlargeAndResetBlist( This, G4LogicalVolume_GetNoDaughters(motherLogical) );
#endif
  
  // noStepLocal -> Local boolean storing the noStep value for the current track.
  // noStepAll -> Value returned after a reduction of all noStepLocals

  initialNode = true;
  G4bool noStepLocal = true;
  G4bool noStepAll = true;
 // GEOMETRYLOC G4SmartVoxelNode * nullVNodePointer;
  //G4SmartVoxelNode nullNode;
  //nullVNodePointer = &nullNode;
  int counter = -1;
  while ( noStepAll )
  {	 counter ++;
        if( !noStepLocal )
        {   
             curVoxelNode = nullVNode;
             curNoVolumes = 0;
        }
		else
		{
		    curVoxelNode = This->fVoxelNode;
			curNoVolumes = G4VoxelNode_GetNoContained( curVoxelNode );
		}
		int number_of_volume_types = Solidcount, Curr_vol_type=0;
			// Volume type is from the enum ESolids in everything.h

		int current_solid_sum = 0;
		// if (GLOBAL_MODE ==1)
		//     number_of_threads = get_global_size(0);
		int number_of_threads = get_local_size(0);
	 
		// STEP 1:  Iteration through all volume types. Min is calculated one type at a time.
		//REMEMBER:EDIT: Changed starting value of Curr_vol_type to 1	
		for (Curr_vol_type = 1; Curr_vol_type < number_of_volume_types; (Curr_vol_type)++ )
		{	
			
			current_solid_sum = 0;
			int number_of_solids;
			number_of_solids = curVoxelNode->SolidType[ Curr_vol_type ];
				// The number of solids of this type in the current voxel for the current track
				// NOTE: There may be an overhead here. If more than one thread is in the same voxel and reads this data, mem access not coalesced
		
            Numbers_Of_Solid[ locationId ] = number_of_solids;
                // At this stage a parallel scan sum has to be called which updates the sums for all the solids. The function is inlined.
			
			
		    BARRIER_FLEXIBLE;
		        // Barrier to ensure that all threads have filled up the shared mem array. See everything.h for definition
			
			//int all_threads = get_global_size(1);
			
			Prefix_Sum( Numbers_Of_Solid, Sum_Of_Solids, number_of_threads );
				// The array Sum_Of_Solids stores the final result after the Prefix sum scan. 
			
				//Result[PrevSum + current_solid_sum ] = locationId;

			PrevSum = Sum_Of_Solids[ locationId ];
			
			
			
			// NOTE: The use of contentNo below is to maintain some resemblance to Otto's definition in VoxelNavigation.c 
			if ( noStepLocal )
			{
			  for( contentNo = curNoVolumes-1; contentNo>=0 ; contentNo--)
			  {	
				sampleNo = G4VoxelNode_GetVolume(curVoxelNode,contentNo);
				samplePhysical = G4LogicalVolume_GetDaughter(motherLogical,sampleNo);
				//if ( samplePhysical != blockedExitedVol )
				{	
					// NOTE: blockedExitedVol check makes sense for the serial version. Does not make as much sense to keep it on in the parallel version as well.
					sampleSolid = G4LogicalVolume_GetSolid( G4VPhysicalVolume_GetLogicalVolume( samplePhysical ));
					
					// NOTE: We iterate over all solids and compare if the solid type is what we are looking for. If it is then it is added
					// to the array Solids in shared mem. However, a better implementation is to have a way such that the solids are returned in sorted order 
					// the first place.

					if( sampleSolid->type == Curr_vol_type)
					{	
						// Should the solid be stored or the Physical Volume??
						SolidInfo Info = { samplePhysical, locationId };
						Solids[ PrevSum + current_solid_sum ] = Info;
						
						// PrevSum is the sum of all solids of that type for all threads upto this element not including the current thread.
						// current_solid_sum at the end of all iterations should be equal to the number_of_solids.
		      			current_solid_sum++;
					}
				  }
			   } 
			}
			//EDIT: Change to BARRIER_LOCAL when not testing GLOBAL_MODE 
            BARRIER_FLEXIBLE;
			
			//MODIFY : Perhaps a check is in order here.
			// One check at this point could be whether current_solid_sum is equal to the number_of_solids.
			// Before proceeding to call the kernel for calculating the min, check if code works up to this point
			
			
			
			int Total_solids_of_this_type;
			Total_solids_of_this_type = Numbers_Of_Solid[ number_of_threads - 1 ] + Sum_Of_Solids[ number_of_threads - 1 ];
			
			if ( Total_solids_of_this_type > BlockSize * Multiplier)
		   {
			   Total_solids_of_this_type = BlockSize * Multiplier;
			   // Update Errors or return error from here?
			}
		   
			
				// Checking a candidate solid of current type.
				int k=0;
				int iterations = (Total_solids_of_this_type / number_of_threads);
				for( k = 0; k < iterations ; k++ )
				{	
					int Work_id = locationId + number_of_threads*k;
					if( ( Work_id ) < Total_solids_of_this_type)
					{	
						GEOMETRYLOC G4VPhysicalVolume * candPhysical = ( Solids[ Work_id  ].PVolume);
						GEOMETRYLOC G4VSolid * candSolid = ((candPhysical)->flogical)->fSolid;
						int candId = Solids[ Work_id].trackId;
						G4ThreeVector candGlobalPoint = LocationArray[candId].Point;
						G4ThreeVector candGlobalDirection = LocationArray[candId].Direction;
						G4AffineTransform candTf = G4AffineTransform_create_full(
					                                G4VPhysicalVolume_GetObjectRotationValue(candPhysical),
													G4VPhysicalVolume_GetTranslation(candPhysical));

						G4AffineTransform_Invert( &candTf );
				     	const G4ThreeVector candPoint=
						           G4AffineTransform_TransformPoint(&candTf,candGlobalPoint);
						const G4double candSafety = G4VSolid_DistanceToIn( candSolid, candPoint );
						const G4ThreeVector candDirection = G4AffineTransform_TransformAxis( &candTf, candGlobalDirection );
						const G4double candStep = G4VSolid_DistanceToIn_full( candSolid, candPoint, candDirection );
							// Result_For_Current_Solid should hold data for the sampleSafety and the sampleStep
						// NOTE: In this version of navigation the step is calculated along with the safety. It is not checked if
						// step<safety; the safety is only calculated here because the physics may require it later.
						ResultInfo Result_of_Solid = { candSafety, candStep, candId, candPhysical } ;
						Result_For_Current_Solid[ Work_id ] = Result_of_Solid ;					
					}
				    BARRIER_FLEXIBLE;
					// REMOVE: Barrier used in debugging stage. Remove when done.
				}
			
			    	
				// At this point all the safeties and steps have been calculated, now to find the minimum step for the current solid per track.
				// find_minimum...
				// Most basic implementation for finding minimum, a proper min finding algorithm that takes into account threadIds is hard to find.
				
				Find_minimum ( Result_For_Current_Solid, Compacter_Result, PrevSum, number_of_solids );
					// Minimum finding function that finds the minimum step per track in Result_For_Current_Solid and
					// stores the minimum step, the safety and the pointer to the sampleSolid ( physical? ) 
					// The minimum is basically compared to the existing value and stored if smaller. This way find_minimum does not
					// care about which solid type is currently being processed.
					// See gpu.c where Compacter Results initial step value is set to kInfinfity.
					//if( Curr_vol_type ==1)	

		}
		
			//if( Curr_vol_type == 1)
			
	    if (  Compacter_Result[ locationId ].step <= ourStep )	
       {
	     ourStep = Compacter_Result[ locationId ].step;
	 	 *pBlockedPhysical = Compacter_Result[ locationId ].PVolume;
	     *entering = true;
		 *exiting = false;
		 
		}
		
	    if (initialNode)
	    {	
			initialNode = false;
			voxelSafety = G4NewNavigation_ComputeVoxelSafety(This,localPoint);
			if ( voxelSafety<ourSafety )
			{
				ourSafety = voxelSafety;
			}

		    if ( currentProposedStepLength<ourSafety )
			{
				// Guaranteed physics limited
				//      
				noStepLocal = false;
		        *entering = false;
				*exiting = false;
			    *pBlockedPhysical = GEOMETRYNULL;
				ourStep = kInfinity;
			}
			else
			{
				//
				// Compute mother intersection if required
				//
				if ( motherSafety<=ourStep )
				{
			
			        G4double motherStep =
						G4VSolid_DistanceToOut_full( motherSolid, localPoint, localDirection,
                                         true, validExitNormal, exitNormal);

					if ( motherStep<=ourStep )
					{
						ourStep = motherStep;
						*exiting = true;
						*entering = false;
						if ( *validExitNormal )
						{
							G4RotationMatrix rot = G4VPhysicalVolume_GetObjectRotationValue(motherPhysical);
							G4RotationMatrix inv = G4RotationMatrix_inverse(&rot);
				
							*exitNormal = G4RotationMatrix_apply( &inv, *exitNormal );
						}
					}
					else
					{
						*validExitNormal = false;
					}
				}	
			}
			*newSafety = ourSafety;
		}
	
		if (noStepLocal)
		{
			noStepLocal = G4NewNavigation_LocateNextVoxel(This, localPoint, localDirection, ourStep);
		}

		noStepArray[ locationId ] = noStepLocal;
		BARRIER_FLEXIBLE;

		noStepAll = NoStepReduction( noStepArray, number_of_threads);
		
		//Prefix_Sum ( noStepArray, noStepArray, number_of_threads);
		
 }  // end -while (noStep)- loop

 // Double check to make sure all threads have reached here before exiting

BARRIER_FLEXIBLE;

	//Result[ locationId ] = ourStep;	
  return ourStep;
}
	__kernel void TRIPLE_DGEMM_UPDATE_192_24_PART1_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, uint lda, int npages, int na)
{ 
	// Ain is the non inverse matrix; the size of Ain is lda * na
	// offAin is the offset of Ain
	// d_dinvA is the inversed matrix. the size of d_invA is NB * (na-1)/NB + 1
	// blk is subblock size, which is 24 here.
	// lda in leading dimension. Column major here
	// npages = (na-1)/12*2 + 1; for 96 this is 2 for 192 this is 4

	//Work group size is [24, 2]
	//global work size is [96*number of blocks, 2]
	//each work item in each work group is responsible for 12 elements (half) in that row
	//each work group is responsible for one gemm;


	////////////// A12*invA22
	const uint gidx = get_group_id(0); 
	const uint gidy = get_group_id(1); 
	const uint idx = get_local_id(0); 
	const uint idy = get_local_id(1); 

	const uint page = gidx % npages; //0-3 for 192; 0-1 for 96
	const uint page_block = page / 4; //4 pages per page block



	__global double *B, *C; 
	__local double lA[24][24]; 
	__local double lB[24][24]; 
	double privateC[12] = { (double)0 }; 

	//decide A12 location for each page
	Ain = Ain + offAin; 
	Ain += (page*blk * 2 + blk) * lda + page * 2 * blk; 

	//decide invA22 (B) location for each page
	B = d_dinvA + page_block*NB*NB + ((page % 4)*blk * 2 + blk) * NB + (page % 4) * 2 * blk + blk; 

	//decide invA12 location for each page
	C = d_dinvA + page_block*NB*NB + ((page % 4)*blk * 2 + blk) * NB + (page % 4) * 2 * blk; 

	//read A and B into LDS no transpose operated here
	//each work iteam loads half a row
	lA[idx][0 + idy * 12] = Ain[idx + idy * 12 * lda]; 
	lA[idx][1 + idy * 12] = Ain[idx + lda + idy * 12 * lda]; 
	lA[idx][2 + idy * 12] = Ain[idx + lda * 2 + idy * 12 * lda]; 
	lA[idx][3 + idy * 12] = Ain[idx + lda * 3 + idy * 12 * lda]; 
	lA[idx][4 + idy * 12] = Ain[idx + lda * 4 + idy * 12 * lda]; 
	lA[idx][5 + idy * 12] = Ain[idx + lda * 5 + idy * 12 * lda]; 
	lA[idx][6 + idy * 12] = Ain[idx + lda * 6 + idy * 12 * lda]; 
	lA[idx][7 + idy * 12] = Ain[idx + lda * 7 + idy * 12 * lda]; 
	lA[idx][8 + idy * 12] = Ain[idx + lda * 8 + idy * 12 * lda]; 
	lA[idx][9 + idy * 12] = Ain[idx + lda * 9 + idy * 12 * lda]; 
	lA[idx][10 + idy * 12] = Ain[idx + lda * 10 + idy * 12 * lda]; 
	lA[idx][11 + idy * 12] = Ain[idx + lda * 11 + idy * 12 * lda]; 

	lB[idx][0 + idy * 12] = B[idx + idy * 12 * NB]; 
	lB[idx][1 + idy * 12] = B[idx + NB + idy * 12 * NB]; 
	lB[idx][2 + idy * 12] = B[idx + NB * 2 + idy * 12 * NB]; 
	lB[idx][3 + idy * 12] = B[idx + NB * 3 + idy * 12 * NB]; 
	lB[idx][4 + idy * 12] = B[idx + NB * 4 + idy * 12 * NB]; 
	lB[idx][5 + idy * 12] = B[idx + NB * 5 + idy * 12 * NB]; 
	lB[idx][6 + idy * 12] = B[idx + NB * 6 + idy * 12 * NB]; 
	lB[idx][7 + idy * 12] = B[idx + NB * 7 + idy * 12 * NB]; 
	lB[idx][8 + idy * 12] = B[idx + NB * 8 + idy * 12 * NB]; 
	lB[idx][9 + idy * 12] = B[idx + NB * 9 + idy * 12 * NB]; 
	lB[idx][10 + idy * 12] = B[idx + NB * 10 + idy * 12 * NB]; 
	lB[idx][11 + idy * 12] = B[idx + NB * 11 + idy * 12 * NB]; 
	barrier(CLK_LOCAL_MEM_FENCE); 

	//do math

	uint i = 0; 

	do{ 
		privateC[0] = mad(lA[idx][i], lB[i][0 + idy * 12], privateC[0]); 
		privateC[1] = mad(lA[idx][i], lB[i][1 + idy * 12], privateC[1]); 
		privateC[2] = mad(lA[idx][i], lB[i][2 + idy * 12], privateC[2]); 
		privateC[3] = mad(lA[idx][i], lB[i][3 + idy * 12], privateC[3]); 
		privateC[4] = mad(lA[idx][i], lB[i][4 + idy * 12], privateC[4]); 
		privateC[5] = mad(lA[idx][i], lB[i][5 + idy * 12], privateC[5]); 
		privateC[6] = mad(lA[idx][i], lB[i][6 + idy * 12], privateC[6]); 
		privateC[7] = mad(lA[idx][i], lB[i][7 + idy * 12], privateC[7]); 
		privateC[8] = mad(lA[idx][i], lB[i][8 + idy * 12], privateC[8]); 
		privateC[9] = mad(lA[idx][i], lB[i][9 + idy * 12], privateC[9]); 
		privateC[10] = mad(lA[idx][i], lB[i][10 + idy * 12], privateC[10]); 
		privateC[11] = mad(lA[idx][i], lB[i][11 + idy * 12], privateC[11]); 
		i = i + 1; 
	} while (i < 24); 

	i = 0; 
	do{ 
		C[NB*idy * 12 + NB*i + idx] = privateC[i]; 
		i = i + 1; 
	} while (i < 12); 
}
Example #15
__kernel void SliceTimingCorrection(__global float* Corrected_Volumes, 
                                    __global const float* Volumes, 									 
									__private float delta, 									 
									__private int DATA_W, 
									__private int DATA_H, 
									__private int DATA_D, 
									__private int DATA_T)
{
	int x = get_global_id(0);
	int y = get_global_id(1);

	int3 tIdx = {get_local_id(0), get_local_id(1), get_local_id(2)};

	if (x >= DATA_W || y >= DATA_H)
		return;

	float t0, t1, t2, t3;

	// Forward interpolation
	if (delta > 0.0f)
	{
		t0 = Volumes[Calculate3DIndex(x,y,0,DATA_W,DATA_H)];
		t1 = t0;
		t2 = Volumes[Calculate3DIndex(x,y,1,DATA_W,DATA_H)];
		t3 = Volumes[Calculate3DIndex(x,y,2,DATA_W,DATA_H)];

		// Loop over timepoints
		for (int t = 0; t < DATA_T - 3; t++)
		{
			// Cubic interpolation in time
			Corrected_Volumes[Calculate3DIndex(x,y,t,DATA_W,DATA_H)] = InterpolateCubic(t0,t1,t2,t3,delta); 
		
			// Shift old values backwards
			t0 = t1;
			t1 = t2;
			t2 = t3;

			// Read one new value
			t3 = Volumes[Calculate3DIndex(x,y,t+3,DATA_W,DATA_H)];
		}

		int t = DATA_T - 3;	
		Corrected_Volumes[Calculate3DIndex(x,y,t,DATA_W,DATA_H)] = InterpolateCubic(t0,t1,t2,t3,delta); 
	
		t = DATA_T - 2;
		t0 = t1;
		t1 = t2;
		t2 = t3;	
		Corrected_Volumes[Calculate3DIndex(x,y,t,DATA_W,DATA_H)] = InterpolateCubic(t0,t1,t2,t3,delta); 

		t = DATA_T - 1;
		t0 = t1;
		t1 = t2;
		Corrected_Volumes[Calculate3DIndex(x,y,t,DATA_W,DATA_H)] = InterpolateCubic(t0,t1,t2,t3,delta); 
	}
	// Backward interpolation
	else
	{
		delta = 1.0f - (-delta);

		t0 = Volumes[Calculate3DIndex(x,y,0,DATA_W,DATA_H)];
		t1 = t0;
		t2 = t0;
		t3 = Volumes[Calculate3DIndex(x,y,1,DATA_W,DATA_H)];

		// Loop over timepoints
		for (int t = 0; t < DATA_T - 2; t++)
		{
			// Cubic interpolation in time
			Corrected_Volumes[Calculate3DIndex(x,y,t,DATA_W,DATA_H)] = InterpolateCubic(t0,t1,t2,t3,delta); 
		
			// Shift old values backwards
			t0 = t1;
			t1 = t2;
			t2 = t3;

			// Read one new value
			t3 = Volumes[Calculate3DIndex(x,y,t+2,DATA_W,DATA_H)];
		}

		int t = DATA_T - 2;	
		Corrected_Volumes[Calculate3DIndex(x,y,t,DATA_W,DATA_H)] = InterpolateCubic(t0,t1,t2,t3,delta); 
	
		t = DATA_T - 1;
		t0 = t1;
		t1 = t2;
		t2 = t3;	
		Corrected_Volumes[Calculate3DIndex(x,y,t,DATA_W,DATA_H)] = InterpolateCubic(t0,t1,t2,t3,delta); 
	}
}
Example #16
		value stride2 sample dist1 report(1)
		value offset sample dist1 report(1)
		${varj} = arr[${stride1}*it00 + ${stride2}*it01 + ${offset}];
	end

	feature epoch
		value numcomps sample compdist
		${computation[${numcomps}]}
		${access}
	end


	feature loop
		value linterchange sample dist4 report(1)
		genif ${linterchange}==0
			for (unsigned int it00 = get_local_id(0); it00 < outer_tc; it00 += get_local_size(0)) {
				for (unsigned int it01 = get_local_id(1); it01 < inner_tc; it01 += get_local_size(1)) {
					${epoch[${numepochs}]}
				}
			}
		end
		genif ${linterchange} == 1
			for (unsigned int it01 = get_local_id(0); it01 < inner_tc; it01 += get_local_size(0)) {
				for (unsigned int it00 = get_local_id(1); it00 < outer_tc; it00 += get_local_size(1)) {
					${epoch[${numepochs}]}
				}
			}
		end
	end

	generate 5 with dist1 = {1:10},numvardist = {2:8},epochdist = {1:5},compdist = {1:5},dist4 = {0:1}
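Assuming the templates expand as written, a single generated loop nest (linterchange == 0, one epoch, illustrative draws stride1 = 3, stride2 = 2, offset = 1, with the ${computation[...]} part omitted) would resemble the body of Example #1:

	for (unsigned int it00 = get_local_id(0); it00 < outer_tc; it00 += get_local_size(0)) {
		for (unsigned int it01 = get_local_id(1); it01 < inner_tc; it01 += get_local_size(1)) {
			/* ${computation[...]} would be emitted here */
			v0 = arr[3*it00 + 2*it01 + 1];
		}
	}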
Example #17
__kernel void CalculateStatisticalMapsGLMBayesian(__global float* Statistical_Maps,
												  __global float* Beta_Volumes,
												  __global float* AR_Estimates,
		                                          __global const float* Volumes,
		                                          __global const float* Mask,
		                                          __global const int* Seeds,
		                                          __constant float* c_X_GLM,
		                                          __constant float* c_InvOmega0,
											      __constant float* c_S00,
											      __constant float* c_S01,
											      __constant float* c_S11,
		                                          __private int DATA_W,
		                                          __private int DATA_H,
		                                          __private int DATA_D,
		                                          __private int NUMBER_OF_VOLUMES,
		                                          __private int NUMBER_OF_REGRESSORS,
											      __private int NUMBER_OF_ITERATIONS,
												  __private int slice)
{
	int x = get_global_id(0);
	int y = get_global_id(1);
	int z = get_global_id(2);

	int3 tIdx = {get_local_id(0), get_local_id(1), get_local_id(2)};

	if (x >= DATA_W || y >= DATA_H || z >= DATA_D)
		return;

	if ( Mask[Calculate3DIndex(x,y,slice,DATA_W,DATA_H)] != 1.0f )
	{
		Statistical_Maps[Calculate4DIndex(x,y,slice,0,DATA_W,DATA_H,DATA_D)] = 0.0f;
		Statistical_Maps[Calculate4DIndex(x,y,slice,1,DATA_W,DATA_H,DATA_D)] = 0.0f;
		Statistical_Maps[Calculate4DIndex(x,y,slice,2,DATA_W,DATA_H,DATA_D)] = 0.0f;
		Statistical_Maps[Calculate4DIndex(x,y,slice,3,DATA_W,DATA_H,DATA_D)] = 0.0f;
		Statistical_Maps[Calculate4DIndex(x,y,slice,4,DATA_W,DATA_H,DATA_D)] = 0.0f;
		Statistical_Maps[Calculate4DIndex(x,y,slice,5,DATA_W,DATA_H,DATA_D)] = 0.0f;

		Beta_Volumes[Calculate4DIndex(x,y,slice,0,DATA_W,DATA_H,DATA_D)] = 0.0f;
		Beta_Volumes[Calculate4DIndex(x,y,slice,1,DATA_W,DATA_H,DATA_D)] = 0.0f;

		AR_Estimates[Calculate3DIndex(x,y,slice,DATA_W,DATA_H)] = 0.0f;

		return;
	}

	// Get seed from host
	int seed = Seeds[Calculate3DIndex(x,y,slice,DATA_W,DATA_H)];

	// Prior options
	float iota = 1.0f;                 // Decay factor for lag length in prior for rho.
	float r = 0.5f;                    // Prior mean on rho1
	float c = 0.3f;                    // Prior standard deviation on first lag.
	float a0 = 0.01f;                  // First parameter in IG prior for sigma^2
	float b0 = 0.01f;                  // Second parameter in IG prior for sigma^2

	float InvA0 = c * c;

	// Algorithmic options
	float prcBurnin = 10.0f;             // Percentage of nIter used for burnin. Note: effective number of iter is nIter.

	float beta[2];
	float betaT[2];

	int nBurnin = (int)round((float)NUMBER_OF_ITERATIONS*(prcBurnin/100.0f));

	int probability1 = 0;
	int probability2 = 0;
	int probability3 = 0;
	int probability4 = 0;
	int probability5 = 0;
	int probability6 = 0;
	
	float m00[2];
	float m01[2];
	float m10[2];
	float m11[2];

	m00[0] = 0.0f;
	m00[1] = 0.0f;

	m01[0] = 0.0f;
	m01[1] = 0.0f;

	m10[0] = 0.0f;
	m10[1] = 0.0f;

	m11[0] = 0.0f;
	m11[1] = 0.0f;

	float g00 = 0.0f;
	float g01 = 0.0f;
	float g11 = 0.0f;

	float old_value = Volumes[Calculate3DIndex(x,y,0,DATA_W,DATA_H)];

	m00[0] += c_X_GLM[NUMBER_OF_VOLUMES * 0 + 0] * old_value;
	m00[1] += c_X_GLM[NUMBER_OF_VOLUMES * 1 + 0] * old_value;

	g00 += old_value * old_value;

	for (int v = 1; v < NUMBER_OF_VOLUMES; v++)
	{
		float value = Volumes[Calculate3DIndex(x,y,v,DATA_W,DATA_H)];

		m00[0] += c_X_GLM[NUMBER_OF_VOLUMES * 0 + v] * value;
		m00[1] += c_X_GLM[NUMBER_OF_VOLUMES * 1 + v] * value;

		m01[0] += c_X_GLM[NUMBER_OF_VOLUMES * 0 + v] * old_value;
		m01[1] += c_X_GLM[NUMBER_OF_VOLUMES * 1 + v] * old_value;

		m10[0] += c_X_GLM[NUMBER_OF_VOLUMES * 0 + (v - 1)] * value;
		m10[1] += c_X_GLM[NUMBER_OF_VOLUMES * 1 + (v - 1)] * value;

		m11[0] += c_X_GLM[NUMBER_OF_VOLUMES * 0 + (v - 1)] * old_value;
		m11[1] += c_X_GLM[NUMBER_OF_VOLUMES * 1 + (v - 1)] * old_value;

		g00 += value * value;
		g01 += value * old_value;
		g11 += old_value * old_value;

		old_value = value;
	}
	
	float InvOmegaT[2][2];
	float OmegaT[2][2];
	float Xtildesquared[2][2];
	float XtildeYtilde[2];
	float Ytildesquared;

	Xtildesquared[0][0] = c_S00[0 + 0*2];
	Xtildesquared[0][1] = c_S00[0 + 1*2];
	Xtildesquared[1][0] = c_S00[1 + 0*2];
	Xtildesquared[1][1] = c_S00[1 + 1*2];
		
	XtildeYtilde[0] = m00[0];
	XtildeYtilde[1] = m00[1];

	Ytildesquared = g00;

	float sigma2;
	float rho, rhoT, rhoProp, bT;

	rho = 0.0f;

	// Loop over iterations
	for (int i = 0; i < (nBurnin + NUMBER_OF_ITERATIONS); i++)
	{
		InvOmegaT[0][0] = c_InvOmega0[0 + 0 * NUMBER_OF_REGRESSORS] + Xtildesquared[0][0];
		InvOmegaT[0][1] = c_InvOmega0[0 + 1 * NUMBER_OF_REGRESSORS] + Xtildesquared[0][1];
		InvOmegaT[1][0] = c_InvOmega0[1 + 0 * NUMBER_OF_REGRESSORS] + Xtildesquared[1][0];
		InvOmegaT[1][1] = c_InvOmega0[1 + 1 * NUMBER_OF_REGRESSORS] + Xtildesquared[1][1];
		Invert_2x2(InvOmegaT, OmegaT);

		betaT[0] = OmegaT[0][0] * XtildeYtilde[0] + OmegaT[0][1] * XtildeYtilde[1];
		betaT[1] = OmegaT[1][0] * XtildeYtilde[0] + OmegaT[1][1] * XtildeYtilde[1];

		float aT = a0 + (float)NUMBER_OF_VOLUMES/2.0f;
		float temp[2];
		temp[0] = InvOmegaT[0][0] * betaT[0] + InvOmegaT[0][1] * betaT[1];
		temp[1] = InvOmegaT[1][0] * betaT[0] + InvOmegaT[1][1] * betaT[1];
		bT = b0 + 0.5f * (Ytildesquared - betaT[0] * temp[0] - betaT[1] * temp[1]);

		// Block 1 - Step 1a. Update sigma2
		sigma2 = gamrnd(aT,bT,&seed);
		
		// Block 1 - Step 1b. Update beta | sigma2
		MultivariateRandom2(beta,betaT,OmegaT,sigma2,&seed);
		
		if (i > nBurnin)
		{
			if (beta[0] > 0.0f)
			{
				probability1++;
			}

			if (beta[1] > 0.0f)
			{
				probability2++;
			}

			if (beta[0] < 0.0f)
			{
				probability3++;
			}

			if (beta[1] < 0.0f)
			{
				probability4++;
			}

			if ((beta[0] - beta[1]) > 0.0f)
			{
				probability5++;
			}

			if ((beta[1] - beta[0]) > 0.0f)
			{
				probability6++;
			}
		}  

		// Block 2, update rho
		float zsquared = 0.0f;
		float zu = 0.0f;
		float old_eps = 0.0f;

		// Calculate residuals
		for (int v = 1; v < NUMBER_OF_VOLUMES; v++)
		{
			float eps = Volumes[Calculate3DIndex(x,y,v,DATA_W,DATA_H)];
			eps -= c_X_GLM[NUMBER_OF_VOLUMES * 0 + v] * beta[0];
			eps -= c_X_GLM[NUMBER_OF_VOLUMES * 1 + v] * beta[1];

			zsquared += eps * eps;
			zu += eps * old_eps;

			old_eps = eps;
		}

		// Generate rho
		float InvAT = InvA0 + zsquared / sigma2;
		float AT = 1.0f / InvAT;
		rhoT = AT * zu / sigma2;
		MultivariateRandom1(&rhoProp,rhoT,AT,sigma2,&seed);

		if (myabs(rhoProp) < 1.0f)
		{
			rho = rhoProp;
		}

		// Prewhitening of regressors and data
		Xtildesquared[0][0] = c_S00[0 + 0*2] - 2.0f * rho * c_S01[0 + 0*2] + rho * rho * c_S11[0 + 0*2];
		Xtildesquared[0][1] = c_S00[0 + 1*2] - 2.0f * rho * c_S01[0 + 1*2] + rho * rho * c_S11[0 + 1*2];
		Xtildesquared[1][0] = c_S00[1 + 0*2] - 2.0f * rho * c_S01[1 + 0*2] + rho * rho * c_S11[1 + 0*2];
		Xtildesquared[1][1] = c_S00[1 + 1*2] - 2.0f * rho * c_S01[1 + 1*2] + rho * rho * c_S11[1 + 1*2];
		
		XtildeYtilde[0] = m00[0] - rho * (m01[0] + m10[0]) + rho * rho * m11[0];
		XtildeYtilde[1] = m00[1] - rho * (m01[1] + m10[1]) + rho * rho * m11[1];

		Ytildesquared = g00 - 2.0f * rho * g01 + rho * rho * g11;
	}
	
	Statistical_Maps[Calculate4DIndex(x,y,slice,0,DATA_W,DATA_H,DATA_D)] = (float)probability1/(float)NUMBER_OF_ITERATIONS;
	Statistical_Maps[Calculate4DIndex(x,y,slice,1,DATA_W,DATA_H,DATA_D)] = (float)probability2/(float)NUMBER_OF_ITERATIONS;
	Statistical_Maps[Calculate4DIndex(x,y,slice,2,DATA_W,DATA_H,DATA_D)] = (float)probability3/(float)NUMBER_OF_ITERATIONS;
	Statistical_Maps[Calculate4DIndex(x,y,slice,3,DATA_W,DATA_H,DATA_D)] = (float)probability4/(float)NUMBER_OF_ITERATIONS;
	Statistical_Maps[Calculate4DIndex(x,y,slice,4,DATA_W,DATA_H,DATA_D)] = (float)probability5/(float)NUMBER_OF_ITERATIONS;
	Statistical_Maps[Calculate4DIndex(x,y,slice,5,DATA_W,DATA_H,DATA_D)] = (float)probability6/(float)NUMBER_OF_ITERATIONS;

	Beta_Volumes[Calculate4DIndex(x,y,slice,0,DATA_W,DATA_H,DATA_D)] = beta[0];
	Beta_Volumes[Calculate4DIndex(x,y,slice,1,DATA_W,DATA_H,DATA_D)] = beta[1];

	AR_Estimates[Calculate3DIndex(x,y,slice,DATA_W,DATA_H)] = rhoT;
}
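The Invert_2x2 helper used in the sampling loop is not listed here; a hypothetical sketch matching its call signature, using the closed-form adjugate/determinant inverse of a 2 x 2 matrix:

// Hypothetical sketch of Invert_2x2 (the real helper is not shown in this example):
// computes Inverse = Matrix^-1 via the 2 x 2 adjugate / determinant rule.
void Invert_2x2(float Matrix[2][2], float Inverse[2][2])
{
	float determinant = Matrix[0][0] * Matrix[1][1] - Matrix[0][1] * Matrix[1][0];
	float inv_det = 1.0f / determinant;

	Inverse[0][0] =  Matrix[1][1] * inv_det;
	Inverse[0][1] = -Matrix[0][1] * inv_det;
	Inverse[1][0] = -Matrix[1][0] * inv_det;
	Inverse[1][1] =  Matrix[0][0] * inv_det;
}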
Example #18
kernel void ComputeGradientsRTLR_V0_CPU(
    global float* pValuesOfWeights
    , int uLayersCount
    , int maxULayerSize
    , int p_i_j_l_LayerIndex_0_0
    , int p_i_j_l_LayerSize_0_0
    , global float$* weights_0_0
    , int p_i_j_l_LayerIndex_1_0
    , int p_i_j_l_LayerSize_1_0
    , global float$* weights_1_0
    , int p_i_j_l_LayerIndex_2_0
    , int p_i_j_l_LayerSize_2_0
    , global float$* weights_2_0
    , int p_i_j_l_LayerIndex_3_0
    , int p_i_j_l_LayerSize_3_0
    , global float$* weights_3_0
    , int p_i_j_k_LayerSize_0
    , global float* netDerivValues_0
    , int p_i_j_l_LayerIndex_0_1
    , int p_i_j_l_LayerSize_0_1
    , global float$* weights_0_1
    , int p_i_j_l_LayerIndex_1_1
    , int p_i_j_l_LayerSize_1_1
    , global float$* weights_1_1
    , int p_i_j_l_LayerIndex_2_1
    , int p_i_j_l_LayerSize_2_1
    , global float$* weights_2_1
    , int p_i_j_l_LayerIndex_3_1
    , int p_i_j_l_LayerSize_3_1
    , global float$* weights_3_1
    , int p_i_j_k_LayerSize_1
    , global float* netDerivValues_1
    , int p_i_j_l_LayerIndex_0_2
    , int p_i_j_l_LayerSize_0_2
    , global float$* weights_0_2
    , int p_i_j_l_LayerIndex_1_2
    , int p_i_j_l_LayerSize_1_2
    , global float$* weights_1_2
    , int p_i_j_l_LayerIndex_2_2
    , int p_i_j_l_LayerSize_2_2
    , global float$* weights_2_2
    , int p_i_j_l_LayerIndex_3_2
    , int p_i_j_l_LayerSize_3_2
    , global float$* weights_3_2
    , int p_i_j_k_LayerSize_2
    , global float* netDerivValues_2
    , int p_i_j_l_LayerIndex_0_3
    , int p_i_j_l_LayerSize_0_3
    , global float$* weights_0_3
    , int p_i_j_l_LayerIndex_1_3
    , int p_i_j_l_LayerSize_1_3
    , global float$* weights_1_3
    , int p_i_j_l_LayerIndex_2_3
    , int p_i_j_l_LayerSize_2_3
    , global float$* weights_2_3
    , int p_i_j_l_LayerIndex_3_3
    , int p_i_j_l_LayerSize_3_3
    , global float$* weights_3_3
    , int p_i_j_k_LayerSize_3
    , global float* netDerivValues_3
    , int iLayerIndex
    , global float* inputs
    , int inputsSize // + bias (null) = 1, inputs: size
    , global float* outputs
    , global float* desiredOutputs
    , local float* tmpGradients // size = local size
    , global float* gradients
    , global float* gradientSums)
{
    int localId = get_local_id(0);
    int localSize = get_local_size(0);

    int ijValueIndex = get_group_id(0);
    int iValueIndex = ijValueIndex / inputsSize;
    int jValueIndex = ijValueIndex % inputsSize;

    tmpGradients[localId] = 0.0f;

    barrier(CLK_LOCAL_MEM_FENCE);

    // Local size ~ avg uLayerSize
    for (int kLayerIndex = 0; kLayerIndex < uLayersCount; kLayerIndex++)
    {
        int kLayerSize = PickIntValueByLayerIndex(p_i_j_k_LayerSize_0, p_i_j_k_LayerSize_1, p_i_j_k_LayerSize_2, p_i_j_k_LayerSize_3, kLayerIndex);
        bool computeGradient = (kLayerIndex == uLayersCount - 1) && outputs != null && desiredOutputs != null;

        int block = kLayerSize / localSize + (kLayerSize % localSize != 0 ? 1 : 0);
        int kValueIndex = localId * block;
        int max = kValueIndex + block;
        if (max > kLayerSize) max = kLayerSize;
        while (kValueIndex < max)
        {
            float sum = (iLayerIndex == kLayerIndex && iValueIndex == kValueIndex) ? (inputs != null ? inputs[jValueIndex] : 1.0f) : 0.0f;

            int p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_0_0, p_i_j_l_LayerIndex_0_1, p_i_j_l_LayerIndex_0_2, p_i_j_l_LayerIndex_0_3, kLayerIndex);
            int p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_0_0, p_i_j_l_LayerSize_0_1, p_i_j_l_LayerSize_0_2, p_i_j_l_LayerSize_0_3, kLayerIndex);
            global float$* weights = PickFPValueByLayerIndex$(weights_0_0, weights_0_1, weights_0_2, weights_0_3, kLayerIndex);
            sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex);

            p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_1_0, p_i_j_l_LayerIndex_1_1, p_i_j_l_LayerIndex_1_2, p_i_j_l_LayerIndex_1_3, kLayerIndex);
            if (p_i_j_l_LayerIndex != -1)
            {
                p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_1_0, p_i_j_l_LayerSize_1_1, p_i_j_l_LayerSize_1_2, p_i_j_l_LayerSize_1_3, kLayerIndex);
                weights = PickFPValueByLayerIndex$(weights_1_0, weights_1_1, weights_1_2, weights_1_3, kLayerIndex);
                sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex);
            }

            p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_2_0, p_i_j_l_LayerIndex_2_1, p_i_j_l_LayerIndex_2_2, p_i_j_l_LayerIndex_2_3, kLayerIndex);
            if (p_i_j_l_LayerIndex != -1)
            {
                p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_2_0, p_i_j_l_LayerSize_2_1, p_i_j_l_LayerSize_2_2, p_i_j_l_LayerSize_2_3, kLayerIndex);
                weights = PickFPValueByLayerIndex$(weights_2_0, weights_2_1, weights_2_2, weights_2_3, kLayerIndex);
                sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex);
            }

            p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_3_0, p_i_j_l_LayerIndex_3_1, p_i_j_l_LayerIndex_3_2, p_i_j_l_LayerIndex_3_3, kLayerIndex);
            if (p_i_j_l_LayerIndex != -1)
            {
                p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_3_0, p_i_j_l_LayerSize_3_1, p_i_j_l_LayerSize_3_2, p_i_j_l_LayerSize_3_3, kLayerIndex);
                weights = PickFPValueByLayerIndex$(weights_3_0, weights_3_1, weights_3_2, weights_3_3, kLayerIndex);
                sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex);
            }

            global float* netDerivValues = PickFPValueByLayerIndex(netDerivValues_0, netDerivValues_1, netDerivValues_2, netDerivValues_3, kLayerIndex);
            float p = netDerivValues[kValueIndex] * sum;
            GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, kLayerIndex)[kValueIndex] = p;

            if (computeGradient) tmpGradients[localId] += (desiredOutputs[kValueIndex] - outputs[kValueIndex]) * p;

            kValueIndex++;
        }

        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (gradients != null || gradientSums != null)
    {
        ComputeGradinetsRTLR_SetGradients(tmpGradients, gradients, gradientSums);
    }

    /*int pValuesOfWeightsSize2 = uLayersCount * maxULayerSize;
    int block = pValuesOfWeightsSize2 / localSize + (pValuesOfWeightsSize2 % localSize != 0 ? 1 : 0);
    int kLayerAndValueIndex = localId * block;
    int max = kLayerAndValueIndex + block;
    if (max > pValuesOfWeightsSize2) max = pValuesOfWeightsSize2;
    while (kLayerAndValueIndex < max)
    {
        int kLayerIndex = kLayerAndValueIndex / maxULayerSize;
        int kValueIndex = kLayerAndValueIndex % maxULayerSize;
        int kLayerSize = PickIntValueByLayerIndex(p_i_j_k_LayerSize_0, p_i_j_k_LayerSize_1, p_i_j_k_LayerSize_2, p_i_j_k_LayerSize_3, kLayerIndex);
        
        if (kValueIndex < kLayerSize)
        {
            bool computeGradient = (kLayerIndex == uLayersCount - 1) && outputs != null && desiredOutputs != null;

            float sum = (iLayerIndex == kLayerIndex && iValueIndex == kValueIndex) ? (inputs != null ? inputs[jValueIndex] : 1.0f) : 0.0f;

            int p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_0_0, p_i_j_l_LayerIndex_0_1, p_i_j_l_LayerIndex_0_2, p_i_j_l_LayerIndex_0_3, kLayerIndex);
            int p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_0_0, p_i_j_l_LayerSize_0_1, p_i_j_l_LayerSize_0_2, p_i_j_l_LayerSize_0_3, kLayerIndex);
            global float$* weights = PickFPValueByLayerIndex$(weights_0_0, weights_0_1, weights_0_2, weights_0_3, kLayerIndex);
            sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex);
            
            p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_1_0, p_i_j_l_LayerIndex_1_1, p_i_j_l_LayerIndex_1_2, p_i_j_l_LayerIndex_1_3, kLayerIndex);
            if (p_i_j_l_LayerIndex != -1)
            {
                p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_1_0, p_i_j_l_LayerSize_1_1, p_i_j_l_LayerSize_1_2, p_i_j_l_LayerSize_1_3, kLayerIndex);
                weights = PickFPValueByLayerIndex$(weights_1_0, weights_1_1, weights_1_2, weights_1_3, kLayerIndex);
                sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex);
            }

            p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_2_0, p_i_j_l_LayerIndex_2_1, p_i_j_l_LayerIndex_2_2, p_i_j_l_LayerIndex_2_3, kLayerIndex);
            if (p_i_j_l_LayerIndex != -1)
            {
                p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_2_0, p_i_j_l_LayerSize_2_1, p_i_j_l_LayerSize_2_2, p_i_j_l_LayerSize_2_3, kLayerIndex);
                weights = PickFPValueByLayerIndex$(weights_2_0, weights_2_1, weights_2_2, weights_2_3, kLayerIndex);
                sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex);
            }

            p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_3_0, p_i_j_l_LayerIndex_3_1, p_i_j_l_LayerIndex_3_2, p_i_j_l_LayerIndex_3_3, kLayerIndex);
            if (p_i_j_l_LayerIndex != -1)
            {
                p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_3_0, p_i_j_l_LayerSize_3_1, p_i_j_l_LayerSize_3_2, p_i_j_l_LayerSize_3_3, kLayerIndex);
                weights = PickFPValueByLayerIndex$(weights_3_0, weights_3_1, weights_3_2, weights_3_3, kLayerIndex);
                sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex);
            }
            
            global float* netDerivValues = PickFPValueByLayerIndex(netDerivValues_0, netDerivValues_1, netDerivValues_2, netDerivValues_3, kLayerIndex);
            float p = netDerivValues[kValueIndex] * sum;
            GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, kLayerIndex)[kValueIndex] = p;
            
            if (computeGradient) tmpGradients[localId] += (desiredOutputs[kValueIndex] - outputs[kValueIndex]) * p;
        }

        kLayerAndValueIndex++;
    }*/
}
	__kernel void TRIPLE_DGEMM_UPDATE_192_96_PART2_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)
{ 
	// Ain is the non inverse matrix; the size of Ain is lda * na
	// offAin is the offset of Ain
	// d_dinvA is the inversed matrix. the size of d_invA is NB * (na-1)/NB + 1
	// blk is subblock size, which is 48 here.
	// lda in leading dimension. Column major here
	// npages = (na-1)/12*2 + 1; for 96 this is 1 for 192 this is 2

	//Work group size is [24, 2]
	//global work size is [48*number of blocks, 4]
	//each work item in each work group is responsible for 12 elements (1/4) in that row
	//each work group is responsible for 24 by 24 macro tile;

	////////////// -invA11*invA12
	const uint gidx = get_group_id(0); 
	const uint gidy = get_group_id(1); 
	const uint idx = get_local_id(0); 
	const uint idy = get_local_id(1); 

	//uint page = gidx / 2;//0-1 for 192; 0 for 96
	//const uint page = (gidx/2)%2;//index of page within a page_block; 1 pages per page_block
	const uint page_block = gidx / 4; //#index of page_block; 4 WG per page; 4 WG per page_block

	__global double *A, *B, *C; 
	__local double lA[24][48]; 
	__local double lB[48][24]; 
	double privateC[12] = { (double)0 }; 

	//decide invA11 location for each page
	//each workgroup loads half of A (left or right)
	//A = d_dinvA + page*NB*NB + gidx%2*(blk/2);
	A = d_dinvA + page_block*NB*NB + gidx % 4 * (blk / 4); 

	//decide invA12 (B) location for each page
	//actually it was saved in invA21 from last kernel
	//each workgroup loads half of B (up or down)
	//B = d_dinvA + page*NB*NB + blk*NB + gidy*(blk/2)*NB;
	B = d_dinvA + page_block*NB*NB + blk*NB + gidy*(blk / 4)*NB; 

	//decide invA12 location for each page
	//each workgroup writes 1/4 of C
	//C = d_dinvA + page*NB*NB + blk * NB + gidx%2*(blk/2) + gidy*(blk/2)*NB;
	C = d_dinvA + page_block*NB*NB + blk*NB + gidx % 4 * (blk / 4) + gidy*(blk / 4)*NB; 

	//read A and B into LDS no transpose operated here
	//each work item loads a half row of A and half column of B
	//idx 0-23 idy 0-1

	uint block_k = blk / 48;  //thus we need 2 iterations here
	do{
		barrier(CLK_LOCAL_MEM_FENCE); 
		lA[idx][0 + idy * 24] = A[idx + idy * 24 * NB]; 
		lA[idx][1 + idy * 24] = A[idx + NB + idy * 24 * NB]; 
		lA[idx][2 + idy * 24] = A[idx + NB * 2 + idy * 24 * NB];
		lA[idx][3 + idy * 24] = A[idx + NB * 3 + idy * 24 * NB];
		lA[idx][4 + idy * 24] = A[idx + NB * 4 + idy * 24 * NB];
		lA[idx][5 + idy * 24] = A[idx + NB * 5 + idy * 24 * NB];
		lA[idx][6 + idy * 24] = A[idx + NB * 6 + idy * 24 * NB];
		lA[idx][7 + idy * 24] = A[idx + NB * 7 + idy * 24 * NB];
		lA[idx][8 + idy * 24] = A[idx + NB * 8 + idy * 24 * NB];
		lA[idx][9 + idy * 24] = A[idx + NB * 9 + idy * 24 * NB];
		lA[idx][10 + idy * 24] = A[idx + NB * 10 + idy * 24 * NB];
		lA[idx][11 + idy * 24] = A[idx + NB * 11 + idy * 24 * NB];
		lA[idx][12 + idy * 24] = A[idx + NB * 12 + idy * 24 * NB];
		lA[idx][13 + idy * 24] = A[idx + NB * 13 + idy * 24 * NB];
		lA[idx][14 + idy * 24] = A[idx + NB * 14 + idy * 24 * NB];
		lA[idx][15 + idy * 24] = A[idx + NB * 15 + idy * 24 * NB];
		lA[idx][16 + idy * 24] = A[idx + NB * 16 + idy * 24 * NB];
		lA[idx][17 + idy * 24] = A[idx + NB * 17 + idy * 24 * NB];
		lA[idx][18 + idy * 24] = A[idx + NB * 18 + idy * 24 * NB];
		lA[idx][19 + idy * 24] = A[idx + NB * 19 + idy * 24 * NB];
		lA[idx][20 + idy * 24] = A[idx + NB * 20 + idy * 24 * NB];
		lA[idx][21 + idy * 24] = A[idx + NB * 21 + idy * 24 * NB];
		lA[idx][22 + idy * 24] = A[idx + NB * 22 + idy * 24 * NB];
		lA[idx][23 + idy * 24] = A[idx + NB * 23 + idy * 24 * NB];

		lB[0 + idy * 24][idx] = B[idx*NB + idy * 24]; 
		lB[1 + idy * 24][idx] = B[idx*NB + idy * 24 + 1]; 
		lB[2 + idy * 24][idx] = B[idx*NB + idy * 24 + 2];
		lB[3 + idy * 24][idx] = B[idx*NB + idy * 24 + 3];
		lB[4 + idy * 24][idx] = B[idx*NB + idy * 24 + 4];
		lB[5 + idy * 24][idx] = B[idx*NB + idy * 24 + 5];
		lB[6 + idy * 24][idx] = B[idx*NB + idy * 24 + 6];
		lB[7 + idy * 24][idx] = B[idx*NB + idy * 24 + 7];
		lB[8 + idy * 24][idx] = B[idx*NB + idy * 24 + 8];
		lB[9 + idy * 24][idx] = B[idx*NB + idy * 24 + 9];
		lB[10 + idy * 24][idx] = B[idx*NB + idy * 24 + 10];
		lB[11 + idy * 24][idx] = B[idx*NB + idy * 24 + 11];
		lB[12 + idy * 24][idx] = B[idx*NB + idy * 24 + 12];
		lB[13 + idy * 24][idx] = B[idx*NB + idy * 24 + 13];
		lB[14 + idy * 24][idx] = B[idx*NB + idy * 24 + 14];
		lB[15 + idy * 24][idx] = B[idx*NB + idy * 24 + 15];
		lB[16 + idy * 24][idx] = B[idx*NB + idy * 24 + 16];
		lB[17 + idy * 24][idx] = B[idx*NB + idy * 24 + 17];
		lB[18 + idy * 24][idx] = B[idx*NB + idy * 24 + 18];
		lB[19 + idy * 24][idx] = B[idx*NB + idy * 24 + 19];
		lB[20 + idy * 24][idx] = B[idx*NB + idy * 24 + 20];
		lB[21 + idy * 24][idx] = B[idx*NB + idy * 24 + 21];
		lB[22 + idy * 24][idx] = B[idx*NB + idy * 24 + 22];
		lB[23 + idy * 24][idx] = B[idx*NB + idy * 24 + 23];
		barrier(CLK_LOCAL_MEM_FENCE); 

		//do math

		uint i = 0; 

		do{
			privateC[0] = mad(lA[idx][i], lB[i][0 + idy * 12], privateC[0]);
			privateC[1] = mad(lA[idx][i], lB[i][1 + idy * 12], privateC[1]);
			privateC[2] = mad(lA[idx][i], lB[i][2 + idy * 12], privateC[2]);
			privateC[3] = mad(lA[idx][i], lB[i][3 + idy * 12], privateC[3]);
			privateC[4] = mad(lA[idx][i], lB[i][4 + idy * 12], privateC[4]);
			privateC[5] = mad(lA[idx][i], lB[i][5 + idy * 12], privateC[5]);
			privateC[6] = mad(lA[idx][i], lB[i][6 + idy * 12], privateC[6]);
			privateC[7] = mad(lA[idx][i], lB[i][7 + idy * 12], privateC[7]);
			privateC[8] = mad(lA[idx][i], lB[i][8 + idy * 12], privateC[8]);
			privateC[9] = mad(lA[idx][i], lB[i][9 + idy * 12], privateC[9]);
			privateC[10] = mad(lA[idx][i], lB[i][10 + idy * 12], privateC[10]); 
			privateC[11] = mad(lA[idx][i], lB[i][11 + idy * 12], privateC[11]); 
			i = i + 1; 
		} while (i < 48); 

		A += 48 * NB; 
		B += 48; 

	} while (--block_k>0); 

	uint i = 0; 
	do{
		C[NB*idy * 12 + NB*i + idx] = -1 * privateC[i]; 
		i = i + 1; 
	} while (i < 12); 


}
Example #20
kernel void convolute(int4 imagesize, global unsigned char *input,
                      global unsigned char *output, global kernf *filterG) {
  int4 gid = (int4)(get_global_id(0)*CONV_UNROLL,  get_global_id(1),  get_global_id(2),  0);
  int4 lid = (int4)(get_local_id(0), get_local_id(1), get_local_id(2), 0);
  int4 group = (int4)(get_group_id(0), get_group_id(1), get_group_id(2), 0);
  // First (?) pixel to process with this kernel
  int4 pixelid = gid;

  // Starting offset of the first pixel to process
  int imoffset = pixelid.s0 + imagesize.s0 * pixelid.s1 + imagesize.s0 * imagesize.s1 * pixelid.s2;
  int i,j;

  int dx,dy,dz;

  /* MAD performs a single convolution operation for each kernel,
     using the current 'raw' value as the input image
     'ko' as an instance of an unrolled convolution filter
     'pos' as the X-offset for each of the unrolled convolution filters
     Note that all the if statements dependent only on static values -
     meaning that they can be optimized away by the compiler
  */
#define MAD(ko,pos) {if(CONV_UNROLL>ko) {    \
      if(pos-ko >= 0 && pos-ko < kernsize) {    \
        val[ko] = mmad(val[ko],(kernf)(raw),filter[(pos-ko)+offset]);   \
      }}}
#define MADS(pos) {if(pos<kernsize) { \
    raw=input[imoffset2+pos];       \
    MAD(0,pos); MAD(1,pos); MAD(2,pos); MAD(3,pos); MAD(4,pos); MAD(5,pos); MAD(6,pos); MAD(7,pos); \
    MAD(8,pos); MAD(9,pos); MAD(10,pos); MAD(11,pos); MAD(12,pos); MAD(13,pos); MAD(14,pos); MAD(15,pos); \
    MAD(16,pos); MAD(17,pos); MAD(18,pos); MAD(19,pos); MAD(20,pos); MAD(21,pos); MAD(22,pos); MAD(23,pos); \
    MAD(24,pos); MAD(25,pos); MAD(26,pos); MAD(27,pos); MAD(28,pos); MAD(29,pos); MAD(30,pos); MAD(31,pos); \
    MAD(32,pos); MAD(33,pos); MAD(34,pos); MAD(35,pos); MAD(36,pos); MAD(37,pos); MAD(38,pos); MAD(39,pos); \
    }}

  kernf val[CONV_UNROLL];
  for(j=0;j<CONV_UNROLL;j++)
    val[j]=(kernf)(0.0);

  int localSize = get_local_size(0) * get_local_size(1) * get_local_size(2);
  local kernf filter[kernsize*kernsize*kernsize];

  /* Copy global filter to local memory */
  event_t event = async_work_group_copy(filter,filterG,kernsize*kernsize*kernsize,0);
  wait_group_events(1, &event);

  if(gid.s0 + kernsize + CONV_UNROLL > imagesize.s0 ||
     gid.s1 + kernsize > imagesize.s1 ||
     gid.s2 + kernsize > imagesize.s2) return;

  for(dz=0;dz<kernsize;dz++)
    for(dy=0;dy<kernsize;dy++)  {
      int offset = dy*kernsize*nkernels + dz*kernsize*kernsize*nkernels;
      int imoffset2 = imoffset+dy*imagesize.s0 + dz*imagesize.s0*imagesize.s1;
      unsigned char raw;

      /* kernsize + convolution_unroll < 42 */
      MADS(0); MADS(1); MADS(2); MADS(3); MADS(4); MADS(5);
      MADS(6); MADS(7); MADS(8); MADS(9); MADS(10); MADS(11);
      MADS(12); MADS(13); MADS(14); MADS(15); MADS(16); MADS(17);
      MADS(18); MADS(19); MADS(20); MADS(21); MADS(22); MADS(23);
      MADS(24); MADS(25); MADS(26); MADS(27); MADS(28); MADS(29);
      MADS(30); MADS(31); MADS(32); MADS(33); MADS(34); MADS(35);
      MADS(36); MADS(37); MADS(38); MADS(39); MADS(40); MADS(41);
    }

  for(j=0;j<CONV_UNROLL;j++) {
    kernstore( convert_kernuc(val[j]), imoffset+j, output);
  }
}
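The comment above only holds if CONV_UNROLL, kernsize and nkernels really are compile-time constants, since that is what lets the compiler strip the guards generated by MAD/MADS. A minimal host-side sketch of supplying them as build options follows; the concrete values, and the idea that kernf and the mmad/kernstore/convert_kernuc helpers are provided the same way (for example via a prepended header), are assumptions rather than part of the original source.

#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>

/* Hedged sketch: build the convolution program with the compile-time constants
 * the kernel expects. The values are illustrative; kernsize + CONV_UNROLL must
 * stay below 42 to match the MADS(0..41) unroll in the kernel. */
static int build_convolution_program(cl_program program, cl_device_id device)
{
    const char *options = "-DCONV_UNROLL=8 -Dkernsize=7 -Dnkernels=4 -Dkernf=float4";
    cl_int err = clBuildProgram(program, 1, &device, options, NULL, NULL);
    if (err != CL_SUCCESS) {
        size_t log_size = 0;
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
        char *log = (char *)malloc(log_size);
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
        fprintf(stderr, "build failed:\n%s\n", log);
        free(log);
    }
    return err == CL_SUCCESS;
}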
__kernel void calculate(__global float *sunGeom,
                   __global float *sunVarGeom,
                   __global float *sunSlopeGeom,
                   __local float *gridGeom,
                   __constant float *const_f,
                   __constant float *const_i,
                   
                   __global float *horizonArr,
                   __constant float *z,
                   __global float *o,
                   __constant float *s,
                   __constant float *li,
                   __global float *a,
                   __constant float *latitudeArray,
                   __constant float *longitudeArray,
                   __global float *cbhr,
                   __global float *cdhr,

                   __global float *lumcl,
                   __global float *beam,
                   __global float *globrad,
                   __global float *insol,
                   __global float *diff,
                   __global float *refl )
{
    unsigned int gid = get_global_id(0);
    unsigned int gsz = const_i[0]*const_i[1];
    unsigned int lid = get_local_id(0);
    unsigned int lsz = get_local_size(0);
    float longitTime = 0.0f;
    float o_orig;
    
    //Don't overrun arrays
    if (gid >= gsz)
        return;
    
    if (const_i[3])
        longitTime = -longitudeArray[gid] / 15.0f;
    
    gridGeom[4*lsz+lid] = gridGeom[2*lsz+lid] = (float)(gid / const_i[1]) *const_f[14];
    gridGeom[5*lsz+lid] = gridGeom[3*lsz+lid] = (float)(gid % const_i[1]) *const_f[15];
    
    gridGeom[    lid] = const_f[7] + gridGeom[2*lsz+lid];
    gridGeom[lsz+lid] = const_f[8] + gridGeom[3*lsz+lid];
    
    if (const_i[13]) {
        float coslat = cos(const_f[3] * gridGeom[lsz+lid]);
        sunVarGeom[11*gsz+gid] = coslat * coslat;
    }
    
    float z1 = sunVarGeom[gsz+gid] = sunVarGeom[3*gsz+gid] = z[gid];
    
    if (z1 == UNDEFZ)
        return;
    
    float latitude, longitude, aspect, slope;
    
    if (const_i[14] != NULL) {
        if (o[gid] != 0.0f)
            aspect = sunSlopeGeom[3*gsz+gid] = o[gid] * const_f[3];
        else
            aspect = sunSlopeGeom[3*gsz+gid] = UNDEF;
    } else {
        aspect = sunSlopeGeom[3*gsz+gid] = const_f[23];
    }
    
    if (const_i[18] != NULL)
        latitude = latitudeArray[gid]*const_f[3];
    
    if (const_i[19] != NULL)
        longitude = longitudeArray[gid]*const_f[3];
    
    if (const_i[9] == PROJECTION_LL) {		/* ll projection */
        longitude = gridGeom[lid]*const_f[3];
        latitude = gridGeom[lsz+lid]*const_f[3];
    }
    
    if (const_i[15] == NULL)
        slope = const_f[22];
    else
        slope = s[gid];
        
    float cos_u = cos(const_f[1] - slope);	/* = sin(slope) */
    float sin_u = sin(const_f[1] - slope);	/* = cos(slope) */
    float cos_v = cos(const_f[1] + aspect);
    float sin_v = sin(const_f[1] + aspect);
    
    if (const_i[5] != NULL)
        sunGeom[7*gsz+gid] = const_i[8];
    
    float geom_sinlat = gridGeom[6*lsz+lid] = sin(-latitude);
    float geom_coslat = gridGeom[7*lsz+lid] = cos(-latitude);
    float sin_phi_l = -geom_coslat * cos_u * sin_v + geom_sinlat * sin_u;
    
    sunSlopeGeom[      gid] = atan(-cos_u * cos_v / (geom_sinlat * cos_u * sin_v + geom_coslat * sin_u));
    sunSlopeGeom[  gsz+gid] = cos(asin(sin_phi_l)) * const_f[40];
    sunSlopeGeom[2*gsz+gid] = sin_phi_l * const_f[39];
    
    if ((const_i[22] != NULL) || someRadiation)
        com_par_const(sunGeom, gridGeom, const_f, const_i, longitTime);
    
    if (const_i[22] != NULL) {
        com_par(sunGeom, sunVarGeom, gridGeom,
                const_f, const_i, latitude, longitude);
        float lum = lumcline2(sunGeom, sunVarGeom, sunSlopeGeom, gridGeom,
                              const_f, const_i, horizonArr, z, gid*const_i[7]);
        
        if (lum > 0.0f) {
            lum = rad2deg * asin(lum);
            lumcl[gid] = (float)lum;
        } else {
            lumcl[gid] = UNDEFZ;
        }
    }

    if (someRadiation) {
        joules2(sunGeom, sunVarGeom, sunSlopeGeom, gridGeom,
                const_f, const_i, horizonArr, z, s, li, a, cbhr, cdhr,
                beam, insol, diff, refl, globrad,
                gid*const_i[7], latitude, longitude);
    }
}
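One detail worth noting about the calculate kernel above: gridGeom is a __local float* argument indexed as gridGeom[k*lsz + lid] for k = 0..7, so the launching code has to reserve eight floats of local memory per work-item for it. A minimal sketch, assuming the standard OpenCL host API and taking the argument index from the parameter order above:

#include <CL/cl.h>

/* Hedged sketch: reserve the dynamically sized __local gridGeom buffer.
 * Passing NULL as the argument value together with a non-zero size is how
 * local memory is allocated for a __local kernel parameter; index 3
 * corresponds to gridGeom in the signature above. */
static cl_int set_grid_geom_local_arg(cl_kernel calc_kernel, size_t local_size)
{
    return clSetKernelArg(calc_kernel, 3, 8 * local_size * sizeof(cl_float), NULL);
}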

    uA.d += offsetA;
    uB.d += offsetB;
    C    += offsetC;


    double rC[6][6] = { {(double)0} };
    double rA[6];
    double rB[6];

    __local double lA[392];
    __local double lB[392];

    int gidx = get_group_id(0);
    int gidy = get_group_id(1);
    int idx  = get_local_id(0);
    int idy  = get_local_id(1);


    uA.d += 2*(gidx*24 + idx) + idy*lda;
    uB.d += 2*(gidy*24 + idx) + idy*ldb;

    int block_k = K >> 3;
    do {
        __local double2* plA = (__local double2*)(lA + idy*48 + 2*idx);
        __local double2* plB = (__local double2*)(lB + idy*48 + 2*idx);
        barrier(CLK_LOCAL_MEM_FENCE);
        plB[0 ] = uB.d2v[0 ];
        plB[8 ] = uB.d2v[8 ];
        plB[16] = uB.d2v[16];
        plA[0 ] = uA.d2v[0 ];
{
  float rC[4][4]  = { {(float)0} };
  float rA[1][4];
  float rB[1][4];


  A += offsetA;
  B += offsetB;
  C += offsetC;

  __local float lA[1056];
  __local float lB[1056];

  uint gidx = get_group_id(0);
  uint gidy = get_group_id(1);
  uint idx = get_local_id(0);
  uint idy = get_local_id(1);

  uint idt = 16*idy + idx;
  uint idxT = idt % 16;
  uint idyT = idt / 16;

  A +=  gidx*64*lda+ idxT + idyT*lda;
  B +=  gidy*64*ldb+ idxT + idyT*ldb;


  uint block_k = K >> 4;
  do
  {
    __local float* plA = lA + idxT*65+idyT;
    __local float* plB = lB + idxT*65+idyT;
Example #24
__kernel void diag_dtrtri_upper_192_12_src(
int  isDiagUnit,
__global double const * restrict A, 
uint offA, 
__global double *d_dinvA, 
uint lda, 
uint na)
{ 
int i, j;
double Ystx = 0; 
__local double *y = 0; 
double switcher; 
double neg_switcher; 

// Thread index
int tx = get_local_id(0); 

// Global thread index 
int gx = get_global_id(0); 

// Block index 
int bx = get_group_id(0); 

A = A + offA; 

__global const double *Aoff = A + bx*lda*BLOCK_SIZE + bx*BLOCK_SIZE; 
int NumBLperNB = NB / BLOCK_SIZE; 
d_dinvA += bx / NumBLperNB*NB*NB + (bx % NumBLperNB)*(NB*BLOCK_SIZE + BLOCK_SIZE); 

__local double Bs[BLOCK_SIZE*BLOCK_SIZE]; 
__local double workspace[BLOCK_SIZE];	// workspace used to store the current working column 
		 __ArgLast(float, epsSqr)
// dynamically defined local memory as argument
//         __ArgLast(__local float4*, localPos)

{
    unsigned int tid = get_local_id(0);

    unsigned int gid = get_global_id(0);

    unsigned int localSize = get_local_size(0);


    // Number of tiles we need to iterate

    unsigned int numTiles = numBodies / localSize;

// statically declared local memory
    __Local(float4, localPos, 256);

    // position of this work-item

    float4 myPos = pos[gid];

    float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f);



    for(int i = 0; i < (int)numTiles; ++i)
    {

        // load one tile into local memory

//        int idx = i * localSize + tid;
        uint idx = mad24( (uint)i, localSize, tid);
        localPos[tid] = pos[idx];



        // Synchronize to make sure data is available for processing

        barrier(CLK_LOCAL_MEM_FENCE);


        // calculate acceleration effect due to each body

        // a[i->j] = m[j] * r[i->j] / (r^2 + epsSqr)^(3/2)

        for(int j = 0; j < (int)localSize; ++j)

        {

            // Calculate acceleration caused by particle j on particle i

            float4 r = localPos[j] - myPos;

            float distSqr = r.x * r.x  +  r.y * r.y  +  r.z * r.z;

            float invDist = 1.0f / sqrt(distSqr + epsSqr);

            float invDistCube = invDist * invDist * invDist;

            float s = localPos[j].w * invDistCube;



            // accumulate effect of all particles
            acc += ((float4)s * r);

        }



        // Synchronize so that next tile can be loaded

        barrier(CLK_LOCAL_MEM_FENCE);

    }



    float4 oldVel = vel[gid];



    // updated position and velocity

    float4 newPos = myPos + oldVel * deltaTime + acc * 0.5f * deltaTime * deltaTime;

    newPos.w = myPos.w;



    float4 newVel = oldVel + acc * deltaTime;



    // write to global memory

    pos[gid] = newPos;

    vel[gid] = newVel;

// MANDATORY
	__Return;

}
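Spelled out, the tile loop above implements the softened pairwise acceleration from the a[i->j] comment, followed by a constant-acceleration position and velocity update:

$$\mathbf{a}_i = \sum_{j} \frac{m_j\,(\mathbf{p}_j - \mathbf{p}_i)}{\bigl(\lVert\mathbf{p}_j - \mathbf{p}_i\rVert^2 + \mathrm{epsSqr}\bigr)^{3/2}}, \qquad
\mathbf{p}_i \leftarrow \mathbf{p}_i + \mathbf{v}_i\,\Delta t + \tfrac{1}{2}\,\mathbf{a}_i\,\Delta t^2, \qquad
\mathbf{v}_i \leftarrow \mathbf{v}_i + \mathbf{a}_i\,\Delta t$$

where the mass m_j is carried in the .w component of localPos[j] and Δt is the deltaTime argument.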
Example #26
__kernel void CalculateStatisticalMapSearchlight_(__global float* Classifier_Performance,
        __global const float* Volumes,
        __global const float* Mask,
        __constant float* c_d,
        __constant float* c_Correct_Classes,
        __private int DATA_W,
        __private int DATA_H,
        __private int DATA_D,
        __private int NUMBER_OF_VOLUMES,
        __private float n,
        __private int EPOCS)

{
    int x = get_global_id(0);
    int y = get_global_id(1);
    int z = get_global_id(2);

    int3 tIdx = {get_local_id(0), get_local_id(1), get_local_id(2)};

    if (x >= DATA_W || y >= DATA_H || z >= DATA_D)
        return;

    if ( Mask[Calculate3DIndex(x,y,z,DATA_W,DATA_H)] != 1.0f )
    {
        Classifier_Performance[Calculate3DIndex(x,y,z,DATA_W,DATA_H)] = 0.0f;
        return;
    }

    __local float l_Volume[16][16][16];    // z, y, x

    int classification_performance = 0;

    // Leave one out cross validation
    for (int validation = 0; validation < NUMBER_OF_VOLUMES; validation++)
    {
        float weights[20];

        weights[0]  = 0.0f;
        weights[1]  = 0.0f;
        weights[2]  = 0.0f;
        weights[3]  = 0.0f;
        weights[4]  = 0.0f;
        weights[5]  = 0.0f;
        weights[6]  = 0.0f;
        weights[7]  = 0.0f;
        weights[8]  = 0.0f;
        weights[9]  = 0.0f;
        weights[10] = 0.0f;
        weights[11] = 0.0f;
        weights[12] = 0.0f;
        weights[13] = 0.0f;
        weights[14] = 0.0f;
        weights[15] = 0.0f;
        weights[16] = 0.0f;
        weights[17] = 0.0f;
        weights[18] = 0.0f;
        weights[19] = 0.0f;

        // Do training for a number of iterations
        for (int epoc = 0; epoc < EPOCS; epoc++)
        {
            float gradient[20];

            gradient[0] = 0.0f;
            gradient[1] = 0.0f;
            gradient[2] = 0.0f;
            gradient[3] = 0.0f;
            gradient[4] = 0.0f;
            gradient[5] = 0.0f;
            gradient[6] = 0.0f;
            gradient[7] = 0.0f;
            gradient[8] = 0.0f;
            gradient[9] = 0.0f;
            gradient[10] = 0.0f;
            gradient[11] = 0.0f;
            gradient[12] = 0.0f;
            gradient[13] = 0.0f;
            gradient[14] = 0.0f;
            gradient[15] = 0.0f;
            gradient[16] = 0.0f;
            gradient[17] = 0.0f;
            gradient[18] = 0.0f;
            gradient[19] = 0.0f;

            for (int t = 0; t < NUMBER_OF_VOLUMES; t++)
            {
                // Skip training with validation time point
                if (t == validation)
                {
                    continue;
                }

                float s;

                // Classification for current timepoint
                ReadSphere((__local float*)l_Volume, Volumes, x, y, z, t, tIdx, DATA_W, DATA_H, DATA_D);

                // Make sure all threads have written to local memory
                barrier(CLK_LOCAL_MEM_FENCE);

                // Make classification
                s =  weights[0] * 1.0f;

                // z - 1
                s += weights[1] * l_Volume[tIdx.z + 4 - 1][tIdx.y + 4][tIdx.x + 4 - 1]; 		//
                s += weights[2] * l_Volume[tIdx.z + 4 - 1][tIdx.y + 4 - 1][tIdx.x + 4]; 		//
                s += weights[3] * l_Volume[tIdx.z + 4 - 1][tIdx.y + 4][tIdx.x + 4]; 			// center pixel
                s += weights[4] * l_Volume[tIdx.z + 4 - 1][tIdx.y + 4 + 1][tIdx.x + 4]; 		//
                s += weights[5] * l_Volume[tIdx.z + 4 - 1][tIdx.y + 4][tIdx.x + 4 + 1]; 		//

                // z
                s += weights[6] * l_Volume[tIdx.z + 4][tIdx.y + 4 - 1][tIdx.x + 4 - 1]; 	//
                s += weights[7] * l_Volume[tIdx.z + 4][tIdx.y + 4 - 1][tIdx.x + 4]; 		//
                s += weights[8] * l_Volume[tIdx.z + 4][tIdx.y + 4 - 1][tIdx.x + 4 + 1]; 	//
                s += weights[9] * l_Volume[tIdx.z + 4][tIdx.y + 4][tIdx.x + 4 - 1]; 		//
                s += weights[10] * l_Volume[tIdx.z + 4][tIdx.y + 4][tIdx.x + 4]; 			//	center pixel
                s += weights[11] * l_Volume[tIdx.z + 4][tIdx.y + 4][tIdx.x + 4 + 1]; 		//
                s += weights[12] * l_Volume[tIdx.z + 4][tIdx.y + 4 + 1][tIdx.x + 4 - 1]; 	//
                s += weights[13] * l_Volume[tIdx.z + 4][tIdx.y + 4 + 1][tIdx.x + 4]; 		//
                s += weights[14] * l_Volume[tIdx.z + 4][tIdx.y + 4 + 1][tIdx.x + 4 + 1]; 	//

                // z + 1
                s += weights[15] * l_Volume[tIdx.z + 4 + 1][tIdx.y + 4][tIdx.x + 4 - 1]; 		//
                s += weights[16] * l_Volume[tIdx.z + 4 + 1][tIdx.y + 4 - 1][tIdx.x + 4]; 		//
                s += weights[17] * l_Volume[tIdx.z + 4 + 1][tIdx.y + 4][tIdx.x + 4]; 			// center pixel
                s += weights[18] * l_Volume[tIdx.z + 4 + 1][tIdx.y + 4 + 1][tIdx.x + 4]; 		//
                s += weights[19] * l_Volume[tIdx.z + 4 + 1][tIdx.y + 4][tIdx.x + 4 + 1]; 		//

                // Calculate contribution to gradient
                gradient[0] += (s - c_d[t]) * 1.0f;

                // z - 1
                gradient[1]  += (s - c_d[t]) * l_Volume[tIdx.z + 4 - 1][tIdx.y + 4][tIdx.x + 4 - 1]; 		//
                gradient[2]  += (s - c_d[t]) * l_Volume[tIdx.z + 4 - 1][tIdx.y + 4 - 1][tIdx.x + 4]; 		//
                gradient[3]  += (s - c_d[t]) * l_Volume[tIdx.z + 4 - 1][tIdx.y + 4][tIdx.x + 4]; 			// center pixel
                gradient[4]  += (s - c_d[t]) * l_Volume[tIdx.z + 4 - 1][tIdx.y + 4 + 1][tIdx.x + 4]; 		//
                gradient[5]  += (s - c_d[t]) * l_Volume[tIdx.z + 4 - 1][tIdx.y + 4][tIdx.x + 4 + 1]; 		//

                // z
                gradient[6]  += (s - c_d[t]) * l_Volume[tIdx.z + 4][tIdx.y + 4 - 1][tIdx.x + 4 - 1]; 		//
                gradient[7]  += (s - c_d[t]) * l_Volume[tIdx.z + 4][tIdx.y + 4 - 1][tIdx.x + 4]; 			//
                gradient[8]  += (s - c_d[t]) * l_Volume[tIdx.z + 4][tIdx.y + 4 - 1][tIdx.x + 4 + 1]; 		//
                gradient[9]  += (s - c_d[t]) * l_Volume[tIdx.z + 4][tIdx.y + 4][tIdx.x + 4 - 1]; 			//
                gradient[10] += (s - c_d[t]) * l_Volume[tIdx.z + 4][tIdx.y + 4][tIdx.x + 4]; 				//	center pixel
                gradient[11] += (s - c_d[t]) * l_Volume[tIdx.z + 4][tIdx.y + 4][tIdx.x + 4 + 1]; 			//
                gradient[12] += (s - c_d[t]) * l_Volume[tIdx.z + 4][tIdx.y + 4 + 1][tIdx.x + 4 - 1]; 		//
                gradient[13] += (s - c_d[t]) * l_Volume[tIdx.z + 4][tIdx.y + 4 + 1][tIdx.x + 4]; 			//
                gradient[14] += (s - c_d[t]) * l_Volume[tIdx.z + 4][tIdx.y + 4 + 1][tIdx.x + 4 + 1]; 		//

                // z + 1
                gradient[15] += (s - c_d[t]) * l_Volume[tIdx.z + 4 + 1][tIdx.y + 4][tIdx.x + 4 - 1]; 		//
                gradient[16] += (s - c_d[t]) * l_Volume[tIdx.z + 4 + 1][tIdx.y + 4 - 1][tIdx.x + 4]; 		//
                gradient[17] += (s - c_d[t]) * l_Volume[tIdx.z + 4 + 1][tIdx.y + 4][tIdx.x + 4]; 			// center pixel
                gradient[18] += (s - c_d[t]) * l_Volume[tIdx.z + 4 + 1][tIdx.y + 4 + 1][tIdx.x + 4]; 		//
                gradient[19] += (s - c_d[t]) * l_Volume[tIdx.z + 4 + 1][tIdx.y + 4][tIdx.x + 4 + 1]; 		//


                // end for t
            }

            // Update weights
            weights[0] -= n/(float)NUMBER_OF_VOLUMES * gradient[0];
            weights[1] -= n/(float)NUMBER_OF_VOLUMES * gradient[1];
            weights[2] -= n/(float)NUMBER_OF_VOLUMES * gradient[2];
            weights[3] -= n/(float)NUMBER_OF_VOLUMES * gradient[3];
            weights[4] -= n/(float)NUMBER_OF_VOLUMES * gradient[4];
            weights[5] -= n/(float)NUMBER_OF_VOLUMES * gradient[5];
            weights[6] -= n/(float)NUMBER_OF_VOLUMES * gradient[6];
            weights[7] -= n/(float)NUMBER_OF_VOLUMES * gradient[7];
            weights[8] -= n/(float)NUMBER_OF_VOLUMES * gradient[8];
            weights[9] -= n/(float)NUMBER_OF_VOLUMES * gradient[9];
            weights[10] -= n/(float)NUMBER_OF_VOLUMES * gradient[10];
            weights[11] -= n/(float)NUMBER_OF_VOLUMES * gradient[11];
            weights[12] -= n/(float)NUMBER_OF_VOLUMES * gradient[12];
            weights[13] -= n/(float)NUMBER_OF_VOLUMES * gradient[13];
            weights[14] -= n/(float)NUMBER_OF_VOLUMES * gradient[14];
            weights[15] -= n/(float)NUMBER_OF_VOLUMES * gradient[15];
            weights[16] -= n/(float)NUMBER_OF_VOLUMES * gradient[16];
            weights[17] -= n/(float)NUMBER_OF_VOLUMES * gradient[17];
            weights[18] -= n/(float)NUMBER_OF_VOLUMES * gradient[18];
            weights[19] -= n/(float)NUMBER_OF_VOLUMES * gradient[19];

            // end for epocs
        }

        // Make classification on validation timepoint

        ReadSphere((__local float*)l_Volume, Volumes, x, y, z, validation, tIdx, DATA_W, DATA_H, DATA_D);

        // Make sure all threads have written to local memory
        barrier(CLK_LOCAL_MEM_FENCE);

        // Make classification
        float s;
        s =  weights[0] * 1.0f;

        // z - 1
        s += weights[1] * l_Volume[tIdx.z + 4 - 1][tIdx.y + 4][tIdx.x + 4 - 1]; 		//
        s += weights[2] * l_Volume[tIdx.z + 4 - 1][tIdx.y + 4 - 1][tIdx.x + 4]; 		//
        s += weights[3] * l_Volume[tIdx.z + 4 - 1][tIdx.y + 4][tIdx.x + 4]; 			// center pixel
        s += weights[4] * l_Volume[tIdx.z + 4 - 1][tIdx.y + 4 + 1][tIdx.x + 4]; 		//
        s += weights[5] * l_Volume[tIdx.z + 4 - 1][tIdx.y + 4][tIdx.x + 4 + 1]; 		//

        // z
        s += weights[6] * l_Volume[tIdx.z + 4][tIdx.y + 4 - 1][tIdx.x + 4 - 1]; 		//
        s += weights[7] * l_Volume[tIdx.z + 4][tIdx.y + 4 - 1][tIdx.x + 4]; 			//
        s += weights[8] * l_Volume[tIdx.z + 4][tIdx.y + 4 - 1][tIdx.x + 4 + 1]; 		//
        s += weights[9] * l_Volume[tIdx.z + 4][tIdx.y + 4][tIdx.x + 4 - 1]; 			//
        s += weights[10] * l_Volume[tIdx.z + 4][tIdx.y + 4][tIdx.x + 4]; 			//	center pixel
        s += weights[11] * l_Volume[tIdx.z + 4][tIdx.y + 4][tIdx.x + 4 + 1]; 		//
        s += weights[12] * l_Volume[tIdx.z + 4][tIdx.y + 4 + 1][tIdx.x + 4 - 1]; 	//
        s += weights[13] * l_Volume[tIdx.z + 4][tIdx.y + 4 + 1][tIdx.x + 4]; 		//
        s += weights[14] * l_Volume[tIdx.z + 4][tIdx.y + 4 + 1][tIdx.x + 4 + 1]; 	//

        // z + 1
        s += weights[15] * l_Volume[tIdx.z + 4 + 1][tIdx.y + 4][tIdx.x + 4 - 1]; 		//
        s += weights[16] * l_Volume[tIdx.z + 4 + 1][tIdx.y + 4 - 1][tIdx.x + 4]; 		//
        s += weights[17] * l_Volume[tIdx.z + 4 + 1][tIdx.y + 4][tIdx.x + 4]; 			// center pixel
        s += weights[18] * l_Volume[tIdx.z + 4 + 1][tIdx.y + 4 + 1][tIdx.x + 4]; 		//
        s += weights[19] * l_Volume[tIdx.z + 4 + 1][tIdx.y + 4][tIdx.x + 4 + 1]; 		//

        float classification;
        if (s > 0.0f)
        {
            classification = 0.0f;
        }
        else
        {
            classification = 1.0f;
        }

        if (classification == c_Correct_Classes[validation])
        {
            classification_performance++;
        }

        // end for validation
    }

    Classifier_Performance[Calculate3DIndex(x,y,z,DATA_W,DATA_H)] = (float)classification_performance / (float)NUMBER_OF_VOLUMES;
}
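The epoch loop above is plain batch gradient descent on a linear model over a bias term plus 19 neighbouring voxel intensities. For each left-out volume v, the leave-one-out pass repeats, for EPOCS iterations,

$$\mathbf{w} \leftarrow \mathbf{w} - \frac{n}{N}\sum_{t \ne v}\bigl(\mathbf{w}^{\top}\mathbf{x}_t - d_t\bigr)\,\mathbf{x}_t$$

with N = NUMBER_OF_VOLUMES, d_t = c_d[t] and x_t the vector (1, neighbourhood of voxel (x,y,z) at time t) read through l_Volume. The held-out volume is then assigned class 0 when w·x_v > 0 and class 1 otherwise, and the per-voxel output is the fraction of correctly classified volumes.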
Example #27
__kernel void CalculateStatisticalMapSearchlight___(__global float* Classifier_Performance,
        __global const float* Volumes,
        __global const float* Mask,
        __constant float* c_d,
        __constant float* c_Correct_Classes,
        __private int DATA_W,
        __private int DATA_H,
        __private int DATA_D,
        __private int NUMBER_OF_VOLUMES,

        __private float n,
        __private int EPOCS)

{
    int x = get_global_id(0);
    int y = get_global_id(1);
    int z = get_global_id(2);

    int3 tIdx = {get_local_id(0), get_local_id(1), get_local_id(2)};

    if (x >= DATA_W || y >= DATA_H || z >= DATA_D)
        return;

    if ( Mask[Calculate3DIndex(x,y,z,DATA_W,DATA_H)] != 1.0f )
    {
        Classifier_Performance[Calculate3DIndex(x,y,z,DATA_W,DATA_H)] = 0.0f;
        return;
    }

    if ( ((x + 1) >= DATA_W) || ((y + 1) >= DATA_H) || ((z + 1) >= DATA_D) )
    {
        Classifier_Performance[Calculate3DIndex(x,y,z,DATA_W,DATA_H)] = 0.0f;
        return;
    }


    if ( ((x - 1) < 0) || ((y - 1) < 0) || ((z - 1) < 0) )
    {
        Classifier_Performance[Calculate3DIndex(x,y,z,DATA_W,DATA_H)] = 0.0f;
        return;
    }


    int classification_performance = 0;


    //
    // Training
    //

    float weights[20];

    weights[0]  = 0.0f;
    weights[1]  = 0.0f;
    weights[2]  = 0.0f;
    weights[3]  = 0.0f;
    weights[4]  = 0.0f;
    weights[5]  = 0.0f;
    weights[6]  = 0.0f;
    weights[7]  = 0.0f;
    weights[8]  = 0.0f;
    weights[9]  = 0.0f;
    weights[10] = 0.0f;
    weights[11] = 0.0f;
    weights[12] = 0.0f;
    weights[13] = 0.0f;
    weights[14] = 0.0f;
    weights[15] = 0.0f;
    weights[16] = 0.0f;
    weights[17] = 0.0f;
    weights[18] = 0.0f;
    weights[19] = 0.0f;

    // Do training for a number of iterations
    for (int epoc = 0; epoc < EPOCS; epoc++)
    {
        float gradient[20];

        gradient[0] = 0.0f;
        gradient[1] = 0.0f;
        gradient[2] = 0.0f;
        gradient[3] = 0.0f;
        gradient[4] = 0.0f;
        gradient[5] = 0.0f;
        gradient[6] = 0.0f;
        gradient[7] = 0.0f;
        gradient[8] = 0.0f;
        gradient[9] = 0.0f;
        gradient[10] = 0.0f;
        gradient[11] = 0.0f;
        gradient[12] = 0.0f;
        gradient[13] = 0.0f;
        gradient[14] = 0.0f;
        gradient[15] = 0.0f;
        gradient[16] = 0.0f;
        gradient[17] = 0.0f;
        gradient[18] = 0.0f;
        gradient[19] = 0.0f;

        for (int t = 0; t < NUMBER_OF_VOLUMES / 2; t++)
        {
            // Ignore censored volumes
            if (c_Correct_Classes[t] == 9999.0f)
            {
                continue;
            }

            // Make classification
            float s;
            s =  weights[0] * 1.0f;

            float x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19;

            x1 = Volumes[Calculate4DIndex(x-1,y,z-1,t,DATA_W,DATA_H,DATA_D)];
            x2 = Volumes[Calculate4DIndex(x,y-1,z-1,t,DATA_W,DATA_H,DATA_D)];
            x3 = Volumes[Calculate4DIndex(x,y,z-1,t,DATA_W,DATA_H,DATA_D)];
            x4 = Volumes[Calculate4DIndex(x,y+1,z-1,t,DATA_W,DATA_H,DATA_D)];
            x5 = Volumes[Calculate4DIndex(x+1,y,z-1,t,DATA_W,DATA_H,DATA_D)];

            x6 = Volumes[Calculate4DIndex(x-1,y-1,z,t,DATA_W,DATA_H,DATA_D)];
            x7 = Volumes[Calculate4DIndex(x-1,y,z,t,DATA_W,DATA_H,DATA_D)];
            x8 = Volumes[Calculate4DIndex(x-1,y+1,z,t,DATA_W,DATA_H,DATA_D)];
            x9 = Volumes[Calculate4DIndex(x,y-1,z,t,DATA_W,DATA_H,DATA_D)];
            x10 = Volumes[Calculate4DIndex(x,y,z,t,DATA_W,DATA_H,DATA_D)];
            x11 = Volumes[Calculate4DIndex(x,y+1,z,t,DATA_W,DATA_H,DATA_D)];
            x12 = Volumes[Calculate4DIndex(x+1,y-1,z,t,DATA_W,DATA_H,DATA_D)];
            x13 = Volumes[Calculate4DIndex(x+1,y,z,t,DATA_W,DATA_H,DATA_D)];
            x14 = Volumes[Calculate4DIndex(x+1,y+1,z,t,DATA_W,DATA_H,DATA_D)];

            x15 = Volumes[Calculate4DIndex(x-1,y,z+1,t,DATA_W,DATA_H,DATA_D)];
            x16 = Volumes[Calculate4DIndex(x,y-1,z+1,t,DATA_W,DATA_H,DATA_D)];
            x17 = Volumes[Calculate4DIndex(x,y,z+1,t,DATA_W,DATA_H,DATA_D)];
            x18 = Volumes[Calculate4DIndex(x,y+1,z+1,t,DATA_W,DATA_H,DATA_D)];
            x19 = Volumes[Calculate4DIndex(x+1,y,z+1,t,DATA_W,DATA_H,DATA_D)];

            // z - 1
            s += weights[1] * x1;
            s += weights[2] * x2;
            s += weights[3] * x3;
            s += weights[4] * x4;
            s += weights[5] * x5;

            // z
            s += weights[6] * x6;
            s += weights[7] * x7;
            s += weights[8] * x8;
            s += weights[9] * x9;
            s += weights[10] * x10;
            s += weights[11] * x11;
            s += weights[12] * x12;
            s += weights[13] * x13;
            s += weights[14] * x14;

            // z + 1
            s += weights[15] * x15;
            s += weights[16] * x16;
            s += weights[17] * x17;
            s += weights[18] * x18;
            s += weights[19] * x19;

            // Calculate contribution to gradient
            gradient[0] += (s - c_d[t]) * 1.0f;

            // z - 1
            gradient[1]  += (s - c_d[t]) * x1;
            gradient[2]  += (s - c_d[t]) * x2;
            gradient[3]  += (s - c_d[t]) * x3;
            gradient[4]  += (s - c_d[t]) * x4;
            gradient[5]  += (s - c_d[t]) * x5;

            // z
            gradient[6]  += (s - c_d[t]) * x6;
            gradient[7]  += (s - c_d[t]) * x7;
            gradient[8]  += (s - c_d[t]) * x8;
            gradient[9]  += (s - c_d[t]) * x9;
            gradient[10] += (s - c_d[t]) * x10;
            gradient[11] += (s - c_d[t]) * x11;
            gradient[12] += (s - c_d[t]) * x12;
            gradient[13] += (s - c_d[t]) * x13;
            gradient[14] += (s - c_d[t]) * x14;

            // z + 1
            gradient[15] += (s - c_d[t]) * x15;
            gradient[16] += (s - c_d[t]) * x16;
            gradient[17] += (s - c_d[t]) * x17;
            gradient[18] += (s - c_d[t]) * x18;
            gradient[19] += (s - c_d[t]) * x19;

            // end for t
        }

        // Update weights
        weights[0] -= n/(float)NUMBER_OF_VOLUMES * gradient[0];
        weights[1] -= n/(float)NUMBER_OF_VOLUMES * gradient[1];
        weights[2] -= n/(float)NUMBER_OF_VOLUMES * gradient[2];
        weights[3] -= n/(float)NUMBER_OF_VOLUMES * gradient[3];
        weights[4] -= n/(float)NUMBER_OF_VOLUMES * gradient[4];
        weights[5] -= n/(float)NUMBER_OF_VOLUMES * gradient[5];
        weights[6] -= n/(float)NUMBER_OF_VOLUMES * gradient[6];
        weights[7] -= n/(float)NUMBER_OF_VOLUMES * gradient[7];
        weights[8] -= n/(float)NUMBER_OF_VOLUMES * gradient[8];
        weights[9] -= n/(float)NUMBER_OF_VOLUMES * gradient[9];
        weights[10] -= n/(float)NUMBER_OF_VOLUMES * gradient[10];
        weights[11] -= n/(float)NUMBER_OF_VOLUMES * gradient[11];
        weights[12] -= n/(float)NUMBER_OF_VOLUMES * gradient[12];
        weights[13] -= n/(float)NUMBER_OF_VOLUMES * gradient[13];
        weights[14] -= n/(float)NUMBER_OF_VOLUMES * gradient[14];
        weights[15] -= n/(float)NUMBER_OF_VOLUMES * gradient[15];
        weights[16] -= n/(float)NUMBER_OF_VOLUMES * gradient[16];
        weights[17] -= n/(float)NUMBER_OF_VOLUMES * gradient[17];
        weights[18] -= n/(float)NUMBER_OF_VOLUMES * gradient[18];
        weights[19] -= n/(float)NUMBER_OF_VOLUMES * gradient[19];

        // end for epocs
    }


    //
    // Testing
    //

    float s;

    int uncensoredVolumes = 0;

    // Make classifications
    for (int t = NUMBER_OF_VOLUMES / 2 + 1; t < NUMBER_OF_VOLUMES; t++)
    {
        // Ignore censored volumes
        if (c_Correct_Classes[t] == 9999.0f)
        {
            continue;
        }

        uncensoredVolumes++;

        s =  weights[0] * 1.0f;

        float x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19;

        x1 = Volumes[Calculate4DIndex(x-1,y,z-1,t,DATA_W,DATA_H,DATA_D)];
        x2 = Volumes[Calculate4DIndex(x,y-1,z-1,t,DATA_W,DATA_H,DATA_D)];
        x3 = Volumes[Calculate4DIndex(x,y,z-1,t,DATA_W,DATA_H,DATA_D)];
        x4 = Volumes[Calculate4DIndex(x,y+1,z-1,t,DATA_W,DATA_H,DATA_D)];
        x5 = Volumes[Calculate4DIndex(x+1,y,z-1,t,DATA_W,DATA_H,DATA_D)];

        x6 = Volumes[Calculate4DIndex(x-1,y-1,z,t,DATA_W,DATA_H,DATA_D)];
        x7 = Volumes[Calculate4DIndex(x-1,y,z,t,DATA_W,DATA_H,DATA_D)];
        x8 = Volumes[Calculate4DIndex(x-1,y+1,z,t,DATA_W,DATA_H,DATA_D)];
        x9 = Volumes[Calculate4DIndex(x,y-1,z,t,DATA_W,DATA_H,DATA_D)];
        x10 = Volumes[Calculate4DIndex(x,y,z,t,DATA_W,DATA_H,DATA_D)];

        x11 = Volumes[Calculate4DIndex(x,y+1,z,t,DATA_W,DATA_H,DATA_D)];
        x12 = Volumes[Calculate4DIndex(x+1,y-1,z,t,DATA_W,DATA_H,DATA_D)];
        x13 = Volumes[Calculate4DIndex(x+1,y,z,t,DATA_W,DATA_H,DATA_D)];
        x14 = Volumes[Calculate4DIndex(x+1,y+1,z,t,DATA_W,DATA_H,DATA_D)];

        x15 = Volumes[Calculate4DIndex(x-1,y,z+1,t,DATA_W,DATA_H,DATA_D)];
        x16 = Volumes[Calculate4DIndex(x,y-1,z+1,t,DATA_W,DATA_H,DATA_D)];
        x17 = Volumes[Calculate4DIndex(x,y,z+1,t,DATA_W,DATA_H,DATA_D)];
        x18 = Volumes[Calculate4DIndex(x,y+1,z+1,t,DATA_W,DATA_H,DATA_D)];
        x19 = Volumes[Calculate4DIndex(x+1,y,z+1,t,DATA_W,DATA_H,DATA_D)];

        // z - 1
        s += weights[1] * x1;
        s += weights[2] * x2;
        s += weights[3] * x3;
        s += weights[4] * x4;
        s += weights[5] * x5;

        // z
        s += weights[6] * x6;
        s += weights[7] * x7;
        s += weights[8] * x8;
        s += weights[9] * x9;
        s += weights[10] * x10;
        s += weights[11] * x11;
        s += weights[12] * x12;
        s += weights[13] * x13;
        s += weights[14] * x14;

        // z + 1
        s += weights[15] * x15;
        s += weights[16] * x16;
        s += weights[17] * x17;
        s += weights[18] * x18;
        s += weights[19] * x19;

        float classification;
        if (s > 0.0f)
        {
            classification = 0.0f;
        }
        else
        {
            classification = 1.0f;
        }

        if (classification == c_Correct_Classes[t])
        {
            classification_performance++;
        }
    }

    Classifier_Performance[Calculate3DIndex(x,y,z,DATA_W,DATA_H)] = (float)classification_performance / (float)uncensoredVolumes;
}
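This variant trades leave-one-out for a single split: it trains on the first half of the volumes and tests on the second half, and any time point whose entry in c_Correct_Classes equals 9999.0f is treated as censored and skipped on both sides. A hedged host-side sketch of filling that constant array (the helper name, the censored flags and the label encoding are assumptions, not part of the original source):

#include <stdlib.h>
#include <CL/cl.h>

/* Hypothetical helper: mark censored time points with the 9999.0f sentinel the
 * kernel tests for, and upload the result as a read-only buffer that can be
 * bound to the __constant c_Correct_Classes argument. */
static cl_mem upload_correct_classes(cl_context ctx, const float *labels,
                                     const int *censored, int number_of_volumes,
                                     cl_int *err)
{
    float *host = (float *)malloc(number_of_volumes * sizeof(float));
    for (int t = 0; t < number_of_volumes; t++)
        host[t] = censored[t] ? 9999.0f : labels[t];   /* labels are 0.0f or 1.0f */

    cl_mem buf = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                number_of_volumes * sizeof(float), host, err);
    free(host);
    return buf;
}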
Example #28
void step_bodies(
		struct Body * bodies,
		struct Pair * pairs,
		unsigned int * map,
		float dt,
		unsigned int num_bodies, // in
		float * velocity_ratio, // in/out
		float * mass_center, // in
		float mass, // in
		unsigned int * number_escaped // out
		)
{
	/* work group */
	int local_block = num_bodies / get_num_groups(0);

	unsigned int i_group0 = get_group_id(0) * local_block;
	unsigned int i_group1 = i_group0 + local_block;

	if(get_group_id(0) == (get_num_groups(0) - 1)) i_group1 = num_bodies;

	/* work item */
	int block = (i_group1 - i_group0) / get_local_size(0);

	unsigned int i_local0 = i_group0 + get_local_id(0) * block;
	unsigned int i_local1 = i_local0 + block;

	if(get_local_id(0) == (get_local_size(0) - 1)) i_local1 = i_group1;

	/*
	   printf("local_block = %i\n", local_block);
	   printf("block = %i\n", block);
	   */
	/*
	   printf("i_local0 = %i\n", i_local0);
	   printf("i_local1 = %i\n", i_local1);
	   */
	/* copy data for work group */
	//__local struct Pair local_pairs[NUM_PAIRS];
	//__local struct BodyMap local_bodymaps[NUM_BODIES / NUM_GROUPS];

	//event_t e0 = async_work_group_copy((__local char *)local_pairs, (char *)pairs, NUM_PAIRS * sizeof(struct Pair), 0);
	//wait_group_events(1, &e0);

	//event_t e1 = async_work_group_copy((__local char *)local_bodymaps, (char *)(bodymaps + i_group0), (i_group1 - i_group0) * sizeof(struct BodyMap), 0);
	//wait_group_events(1, &e1);

	/* */
	float f[3];

	//__local struct BodyMap * pbm = 0;
	//struct BodyMap * pbm = 0;


	Body * pb = 0;

	for(unsigned int b = i_local0; b < i_local1; b++)
	{
		//pbm = local_bodymaps + b;
		//pbm = bodymaps + b;

		pb = bodies + b;

		if(pb->alive == 0)
		{
			//puts("body dead");
			continue;
		}

		f[0] = 0;
		f[1] = 0;
		f[2] = 0;

		for(unsigned int i = 0; i < num_bodies; i++)
		{
			if(b == i) continue;

			//__local struct Pair * pp = &local_pairs[pbm->pair[p]];
			Pair * pp = pairs + map[b * num_bodies + i];

			if(pp->_M_alive == 0) continue;


			if(pp->b0 == b)
			{
				f[0] -= pp->u[0] * pp->f;
				f[1] -= pp->u[1] * pp->f;
				f[2] -= pp->u[2] * pp->f;
			}
			else if(pp->b1 == b)
			{
				f[0] += pp->u[0] * pp->f;
				f[1] += pp->u[1] * pp->f;
				f[2] += pp->u[2] * pp->f;
			}
			else
			{
				assert(0);
			}
		}

		float dv[3];

		if(0)
		{
			dv[0] = dt * f[0] / pb->mass;
			dv[1] = dt * f[1] / pb->mass;
			dv[2] = dt * f[2] / pb->mass;
		}
		else
		{
			dv[0] = dt * pb->f[0] / pb->mass;
			dv[1] = dt * pb->f[1] / pb->mass;
			dv[2] = dt * pb->f[2] / pb->mass;
		}
		//print(pb->f);

		if(
				(!feq(pb->f[0], f[0])) ||
				(!feq(pb->f[1], f[1])) ||
				(!feq(pb->f[2], f[2]))
		  )
		{
			print(f);
			print(pb->f);
			abort();
		}

		assert(std::isfinite(pb->mass));
		assert(std::isfinite(dt));
		assert(std::isfinite(pb->f[0]));
		assert(std::isfinite(pb->f[1]));
		assert(std::isfinite(pb->f[2]));

		// reset accumulating force
		pb->f[0] = 0;
		pb->f[1] = 0;
		pb->f[2] = 0;


		float e = 0.01;

		float rat[3];
		rat[0] = fabs(dv[0] / pb->v[0]);
		rat[1] = fabs(dv[1] / pb->v[1]);
		rat[2] = fabs(dv[2] / pb->v[2]);

		// atomic
		if(std::isfinite(rat[0])) if(rat[0] > velocity_ratio[0]) velocity_ratio[0] = rat[0];
		if(std::isfinite(rat[1])) if(rat[1] > velocity_ratio[1]) velocity_ratio[1] = rat[1];
		if(std::isfinite(rat[2])) if(rat[2] > velocity_ratio[2]) velocity_ratio[2] = rat[2];

		if(0)
		{
			if(
					((std::isfinite(rat[0])) && (rat[0] > e)) ||
					((std::isfinite(rat[1])) && (rat[1] > e)) ||
					((std::isfinite(rat[2])) && (rat[2] > e))
			  )
			{
				printf("% 12f % 12f % 12f\n",
						rat[0],
						rat[1],
						rat[2]);
			}
		}

		pb->v[0] += dv[0];
		pb->v[1] += dv[1];
		pb->v[2] += dv[2];

		pb->x[0] += dt * pb->v[0];
		pb->x[1] += dt * pb->v[1];
		pb->x[2] += dt * pb->v[2];

		// distance from mass center
		float r[3];
		r[0] = pb->x[0] - mass_center[0];
		r[1] = pb->x[1] - mass_center[1];
		r[2] = pb->x[2] - mass_center[2];

		float d = sqrt(r[0]*r[0] + r[1]*r[1] + r[2]*r[2]);

		float escape_speed2 = 2.0 * 6.67e-11 * mass / d;

		float s2 = pb->v[0]*pb->v[0] + pb->v[1]*pb->v[1] + pb->v[2]*pb->v[2];

		// dot product of velocity and displacement vector
		float dot = pb->v[0] * r[0] + pb->v[1] * r[1] + pb->v[2] * r[2];

		if(s2 > (escape_speed2)) // speed exceeds escape speed
		{
			if(dot > 0.0) // parallel component points away from mass_center
			{
				// atomic
				(*number_escaped)++;
				//printf("escape!\n");
			}
		}
	}
}
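The check at the end of the body loop combines the classical escape-speed bound with a direction test; a body is counted as escaped when

$$\lVert\mathbf{v}\rVert^2 > \frac{2\,G\,M}{d} \quad\text{and}\quad \mathbf{v}\cdot(\mathbf{x} - \mathbf{c}) > 0$$

where G = 6.67e-11, M is the mass argument, c is mass_center and d = ||x - c||. The dot-product condition keeps bodies that are still moving back toward the mass center from being counted as escaped.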
	__kernel void TRIPLE_DGEMM_UPDATE_192_48_PART1_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)
{
	// Ain is the non-inverted matrix; the size of Ain is lda * na
	// offAin is the offset of Ain
	// d_dinvA is the inverted matrix; the size of d_dinvA is NB * (na-1)/NB + 1
	// blk is the sub-block size, which is 48 here
	// lda is the leading dimension. Column major here
	// npages = (na-1)/12*2 + 1; for 96 this is 1 for 192 this is 2

	//Work group size is [24, 2]
	//global work size is [96*number of blocks, 4]
	//each work item in each work group is responsible for 12 elements (1/4) in that row
	//each work group is responsible for 24 by 24 macro tile;

	////////////// A12*invA22
	const uint gidx = get_group_id(0);
	const uint gidy = get_group_id(1);
	const uint idx = get_local_id(0);
	const uint idy = get_local_id(1);

	//uint page = gidx / 2;//0-1 for 192; 0 for 96
	const uint page = (gidx / 2) % 2; //index of page within a page_block; 2 pages per page_block
	const uint page_block = gidx / 4; //index of page_block; 2 WG per page; 4 WG per page_block

	__global double *B, *C;
	__local double lA[24][48];
	__local double lB[48][24];
	double privateC[12] = { (double)0 };

	//decide A12 location for each page
	//each workgroup loads half of A (left or right)
	Ain = Ain + offAin;
	Ain += page_block*NB*lda + page_block*NB + page*blk * 2 * lda + page*blk * 2 + blk*lda + gidx % 2 * (blk / 2);

	//decide invA22 (B) location for each page
	//each workgroup loads half of B (up or down)
	B = d_dinvA + page_block*NB*NB + page*blk * 2 * NB + page*blk * 2 + blk*NB + blk + gidy*(blk / 2)*NB;

	//decide invA12 location for each page;
	//Actually this will be stored in invA21 temporarily
	//each workgroup writes 1/4 of C
	C = d_dinvA + page_block*NB*NB + page*blk * 2 * NB + page*blk * 2 + blk*NB + gidx % 2 * (blk / 2) + gidy*(blk / 2)*NB;

	//read A and B into LDS no transpose operated here
	//each work item loads a half row of A and half column of B
	//idx 0-23 idy 0-1
	lA[idx][0 + idy * 24] = Ain[idx + idy * 24 * lda];
	lA[idx][1 + idy * 24] = Ain[idx + lda + idy * 24 * lda];
	lA[idx][2 + idy * 24] = Ain[idx + lda * 2 + idy * 24 * lda];
	lA[idx][3 + idy * 24] = Ain[idx + lda * 3 + idy * 24 * lda];
	lA[idx][4 + idy * 24] = Ain[idx + lda * 4 + idy * 24 * lda];
	lA[idx][5 + idy * 24] = Ain[idx + lda * 5 + idy * 24 * lda];
	lA[idx][6 + idy * 24] = Ain[idx + lda * 6 + idy * 24 * lda];
	lA[idx][7 + idy * 24] = Ain[idx + lda * 7 + idy * 24 * lda];
	lA[idx][8 + idy * 24] = Ain[idx + lda * 8 + idy * 24 * lda];
	lA[idx][9 + idy * 24] = Ain[idx + lda * 9 + idy * 24 * lda];
	lA[idx][10 + idy * 24] = Ain[idx + lda * 10 + idy * 24 * lda];
	lA[idx][11 + idy * 24] = Ain[idx + lda * 11 + idy * 24 * lda];
	lA[idx][12 + idy * 24] = Ain[idx + lda * 12 + idy * 24 * lda];
	lA[idx][13 + idy * 24] = Ain[idx + lda * 13 + idy * 24 * lda];
	lA[idx][14 + idy * 24] = Ain[idx + lda * 14 + idy * 24 * lda];
	lA[idx][15 + idy * 24] = Ain[idx + lda * 15 + idy * 24 * lda];
	lA[idx][16 + idy * 24] = Ain[idx + lda * 16 + idy * 24 * lda];
	lA[idx][17 + idy * 24] = Ain[idx + lda * 17 + idy * 24 * lda];
	lA[idx][18 + idy * 24] = Ain[idx + lda * 18 + idy * 24 * lda];
	lA[idx][19 + idy * 24] = Ain[idx + lda * 19 + idy * 24 * lda];
	lA[idx][20 + idy * 24] = Ain[idx + lda * 20 + idy * 24 * lda];
	lA[idx][21 + idy * 24] = Ain[idx + lda * 21 + idy * 24 * lda];
	lA[idx][22 + idy * 24] = Ain[idx + lda * 22 + idy * 24 * lda];
	lA[idx][23 + idy * 24] = Ain[idx + lda * 23 + idy * 24 * lda];

	lB[0 + idy * 24][idx] = B[idx*NB + idy * 24];
	lB[1 + idy * 24][idx] = B[idx*NB + idy * 24 + 1];
	lB[2 + idy * 24][idx] = B[idx*NB + idy * 24 + 2];
	lB[3 + idy * 24][idx] = B[idx*NB + idy * 24 + 3];
	lB[4 + idy * 24][idx] = B[idx*NB + idy * 24 + 4];
	lB[5 + idy * 24][idx] = B[idx*NB + idy * 24 + 5];
	lB[6 + idy * 24][idx] = B[idx*NB + idy * 24 + 6];
	lB[7 + idy * 24][idx] = B[idx*NB + idy * 24 + 7];
	lB[8 + idy * 24][idx] = B[idx*NB + idy * 24 + 8];
	lB[9 + idy * 24][idx] = B[idx*NB + idy * 24 + 9];
	lB[10 + idy * 24][idx] = B[idx*NB + idy * 24 + 10];
	lB[11 + idy * 24][idx] = B[idx*NB + idy * 24 + 11];
	lB[12 + idy * 24][idx] = B[idx*NB + idy * 24 + 12];
	lB[13 + idy * 24][idx] = B[idx*NB + idy * 24 + 13];
	lB[14 + idy * 24][idx] = B[idx*NB + idy * 24 + 14];
	lB[15 + idy * 24][idx] = B[idx*NB + idy * 24 + 15];
	lB[16 + idy * 24][idx] = B[idx*NB + idy * 24 + 16];
	lB[17 + idy * 24][idx] = B[idx*NB + idy * 24 + 17];
	lB[18 + idy * 24][idx] = B[idx*NB + idy * 24 + 18];
	lB[19 + idy * 24][idx] = B[idx*NB + idy * 24 + 19];
	lB[20 + idy * 24][idx] = B[idx*NB + idy * 24 + 20];
	lB[21 + idy * 24][idx] = B[idx*NB + idy * 24 + 21];
	lB[22 + idy * 24][idx] = B[idx*NB + idy * 24 + 22];
	lB[23 + idy * 24][idx] = B[idx*NB + idy * 24 + 23];
	barrier(CLK_LOCAL_MEM_FENCE);

	//do math

	uint i = 0;

	do{
		privateC[0] = mad(lA[idx][i], lB[i][0 + idy * 12], privateC[0]);
		privateC[1] = mad(lA[idx][i], lB[i][1 + idy * 12], privateC[1]);
		privateC[2] = mad(lA[idx][i], lB[i][2 + idy * 12], privateC[2]);
		privateC[3] = mad(lA[idx][i], lB[i][3 + idy * 12], privateC[3]);
		privateC[4] = mad(lA[idx][i], lB[i][4 + idy * 12], privateC[4]);
		privateC[5] = mad(lA[idx][i], lB[i][5 + idy * 12], privateC[5]);
		privateC[6] = mad(lA[idx][i], lB[i][6 + idy * 12], privateC[6]);
		privateC[7] = mad(lA[idx][i], lB[i][7 + idy * 12], privateC[7]);
		privateC[8] = mad(lA[idx][i], lB[i][8 + idy * 12], privateC[8]);
		privateC[9] = mad(lA[idx][i], lB[i][9 + idy * 12], privateC[9]);
		privateC[10] = mad(lA[idx][i], lB[i][10 + idy * 12], privateC[10]);
		privateC[11] = mad(lA[idx][i], lB[i][11 + idy * 12], privateC[11]);
		i = i + 1;
	} while (i < 48);

	i = 0;
	do{
		C[NB*idy * 12 + NB*i + idx] = privateC[i];
		i = i + 1;
	} while (i < 12);
}
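A hedged sketch of the launch geometry implied by the comments at the top of this kernel (work-group size [24, 2], global size [96 * number of blocks, 4]); the queue and kernel handles and number_of_blocks are assumptions supplied by the caller:

#include <CL/cl.h>

/* Enqueue TRIPLE_DGEMM_UPDATE_192_48_PART1_R with the NDRange described in the
 * kernel's comments: each [24, 2] work-group covers a 24x24 macro tile and each
 * work item accumulates 12 elements of its row. */
static cl_int launch_triple_dgemm_update(cl_command_queue queue, cl_kernel kernel,
                                         size_t number_of_blocks)
{
    const size_t local[2]  = { 24, 2 };
    const size_t global[2] = { 96 * number_of_blocks, 4 };
    return clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global, local,
                                  0, NULL, NULL);
}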