kernel void CylinderKernel(global const real *qx, global const real *qy, global real *result,
#ifdef USE_OPENCL
    global real *loops_g,
#else
    const int Nq,
#endif
    local real *loops, const real cutoff,
    const real scale, const real background,
    const real sub, const int Nradius, const int Nlength, const int Ntheta, const int Nphi)
{
#ifdef USE_OPENCL
    // copy loops info to local memory
    event_t e = async_work_group_copy(loops, loops_g, (Nradius+Nlength+Ntheta+Nphi)*2, 0);
    wait_group_events(1, &e);

    int i = get_global_id(0);
    int count = get_global_size(0);
    if (i < count)
#else
    #pragma omp parallel for
    for (int i=0; i<Nq; i++)
#endif
    {
        const real qxi = qx[i];
        const real qyi = qy[i];
        real ret=REAL(0.0), norm=REAL(0.0), norm_vol=REAL(0.0), vol=REAL(0.0);
        for (int ri=0; ri < Nradius; ri++) {
            const real rv = loops[2*ri];
            const real rw = loops[2*ri+1];
            for (int li=0; li < Nlength; li++) {
                const real lv = loops[2*(li+Nradius)];
                const real lw = loops[2*(li+Nradius)+1];
                for (int thi=0; thi < Ntheta; thi++) {
                    const real thv = loops[2*(thi+Nradius+Nlength)];
                    const real thw = loops[2*(thi+Nradius+Nlength)+1];
                    // #pragma unroll
                    for (int phi=0; phi < Nphi; phi++) {
                        const real phv = loops[2*(phi+Nradius+Nlength+Ntheta)];
                        const real phw = loops[2*(phi+Nradius+Nlength+Ntheta)+1];

                        const real weight = rw*lw*thw*phw;
                        //ret += qxi + qyi + sub + rv + lv + weight + thv + phv;
                        if (weight > cutoff) {
                            ret += f(qxi, qyi, sub, rv, lv, weight, thv, phv);
                            norm += weight;
                            vol += rw*lw*rv*rv*lv;
                            norm_vol += rw*lw;
                        }
                    }
                }
            }
        }
        //if (Ntheta>1) norm = norm/(M_PI/2);
        if (vol != REAL(0.0) && norm_vol != REAL(0.0)) {
            ret *= norm_vol/vol;
        }
        result[i] = scale*ret/norm + background;
    }
}
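/* Illustrative host-side sketch (not part of the original source): the kernel
   above indexes `loops` as interleaved (value, weight) pairs, one contiguous
   block per polydisperse parameter, in the order radius, length, theta, phi.
   A packer in plain C consistent with that layout might look like the
   following; the name pack_loops and its arguments are hypothetical. */
#include <stddef.h>

/* Append one parameter's n quadrature points to the interleaved table at
   dst+offset; dst needs room for 2*n more entries. Returns the next offset. */
static size_t pack_loops(double *dst, size_t offset,
                         const double *values, const double *weights, size_t n)
{
    for (size_t k = 0; k < n; k++) {
        dst[offset + 2*k]     = values[k];   /* read back as loops[2*i]   */
        dst[offset + 2*k + 1] = weights[k];  /* read back as loops[2*i+1] */
    }
    return offset + 2*n;
}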
__kernel void AES_ECB_Encrypt(__global const uchar16* restrict plainText,
                              __global const uchar16* restrict expandedKey,
                              __global uchar16* restrict cipherText,
                              const unsigned int rounds)
{
    // Cache the expanded round keys in local memory. `rounds` counts round
    // keys, so at most 15 are needed (AES-256). Note: __read_only/__write_only
    // apply only to image types in OpenCL C, so the buffers use plain const.
    __local uchar16 localExpandedKey[15];
    event_t cacheEvent = async_work_group_copy(localExpandedKey, expandedKey, rounds, 0);

    const int global_id = get_global_id(0);
    uchar16 state = plainText[global_id];

    wait_group_events(1, &cacheEvent);

    state = AES_AddRoundKey(state, localExpandedKey[0]);
    for (int i = 1; i < rounds - 1; ++i) {
        state = AES_SubBytes(state);
        state = AES_ShiftRows(state);
        state = AES_MixColumns(state);
        state = AES_AddRoundKey(state, localExpandedKey[i]);
    }
    // Final round: standard AES skips MixColumns and uses the last round key.
    state = AES_SubBytes(state);
    state = AES_ShiftRows(state);
    state = AES_AddRoundKey(state, localExpandedKey[rounds - 1]);
    cipherText[global_id] = state;
}
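/* Note (inferred from the code above, not stated in the original source):
   `rounds` counts round keys, i.e. Nr + 1 for Nr encryption rounds, which is
   what makes localExpandedKey[15] large enough for AES-256 (Nr = 14). A
   host-side helper in plain C computing it from the key size might be: */
static unsigned aes_round_key_count(unsigned key_bits)
{
    /* FIPS-197: Nk = key_bits/32, Nr = Nk + 6, round keys = Nr + 1,
       giving 11, 13, 15 for AES-128/192/256. */
    return key_bits / 32 + 6 + 1;
}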
kernel void EllipsoidKernel(global const real *qx, global const real *qy, global real *result,
#ifdef USE_OPENCL
    global real *loops_g,
#else
    const int Nq,
#endif
    local real *loops, const real cutoff,
    const real scale, const real background,
    const real sub, const int Nradius_a, const int Nradius_b, const int Ntheta, const int Nphi)
{
#ifdef USE_OPENCL
    // copy loops info to local memory
    event_t e = async_work_group_copy(loops, loops_g, (Nradius_a+Nradius_b+Ntheta+Nphi)*2, 0);
    wait_group_events(1, &e);

    int i = get_global_id(0);
    int count = get_global_size(0);
    if (i < count)
#else
    #pragma omp parallel for
    for (int i=0; i<Nq; i++)
#endif
    {
        const real qxi = qx[i];
        const real qyi = qy[i];
        real ret=REAL(0.0), norm=REAL(0.0), norm_vol=REAL(0.0), vol=REAL(0.0);
        for (int ai=0; ai < Nradius_a; ai++) {
            const real rav = loops[2*ai];
            const real raw = loops[2*ai+1];
            for (int bi=0; bi < Nradius_b; bi++) {
                const real rbv = loops[2*(bi+Nradius_a)];
                const real rbw = loops[2*(bi+Nradius_a)+1];
                for (int thi=0; thi < Ntheta; thi++) {
                    const real thv = loops[2*(thi+Nradius_a+Nradius_b)];
                    const real thw = loops[2*(thi+Nradius_a+Nradius_b)+1];
                    // #pragma unroll
                    for (int phi=0; phi < Nphi; phi++) {
                        const real phv = loops[2*(phi+Nradius_a+Nradius_b+Ntheta)];
                        const real phw = loops[2*(phi+Nradius_a+Nradius_b+Ntheta)+1];

                        const real weight = raw*rbw*thw*phw;
                        //ret += qxi + qyi + sub + rav + rbv + weight + thv + phv;
                        if (weight > cutoff) {
                            ret += f(qxi, qyi, sub, rav, rbv, weight, thv, phv);
                            norm += weight;
                            vol += raw*rbw*rav*rbv*rbv;
                            norm_vol += raw*rbw;
                        }
                    }
                }
            }
        }
        //if (Ntheta>1) norm = norm/(M_PI/2);
        if (vol != REAL(0.0) && norm_vol != REAL(0.0)) {
            ret *= norm_vol/vol;
        }
        result[i] = scale*ret/norm + background;
    }
}
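/* In both kernels above, the closing statements compute a weighted average
   with volume normalization:

     I(q) = scale * [ sum_k w_k f(q; p_k) / sum_k w_k ]
                  * [ sum_k s_k / sum_k s_k V(p_k) ] + background

   where w_k is the full 4D weight, s_k the size-only weight (rw*lw here,
   raw*rbw above), and V the shape volume up to a constant factor that
   cancels in the ratio (r^2*l for the cylinder, ra*rb^2 for the ellipsoid). */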
kernel void convolute(int4 imagesize, global unsigned char *input,
                      global unsigned char *output, global kernf *filterG)
{
    int4 gid = (int4)(get_global_id(0)*CONV_UNROLL, get_global_id(1), get_global_id(2), 0);
    int4 lid = (int4)(get_local_id(0), get_local_id(1), get_local_id(2), 0);
    int4 group = (int4)(get_group_id(0), get_group_id(1), get_group_id(2), 0);

    // First pixel to process with this work-item
    int4 pixelid = gid;
    // Starting offset of the first pixel to process
    int imoffset = pixelid.s0 + imagesize.s0*pixelid.s1 + imagesize.s0*imagesize.s1*pixelid.s2;
    int i, j;
    int dx, dy, dz;

    /* MAD performs a single convolution operation for each kernel, using:
         the current 'raw' value as the input image,
         'ko' as an instance of an unrolled convolution filter,
         'pos' as the X-offset for each of the unrolled convolution filters.
       Note that all the if statements depend only on static values, meaning
       that they can be optimized away by the compiler. */
    #define MAD(ko,pos) {if(CONV_UNROLL>ko) { \
        if(pos-ko >= 0 && pos-ko < kernsize) { \
            val[ko] = mmad(val[ko],(kernf)(raw),filter[(pos-ko)+offset]); \
        }}}

    #define MADS(pos) {if(pos<kernsize) { \
        raw=input[imoffset2+pos]; \
        MAD(0,pos);  MAD(1,pos);  MAD(2,pos);  MAD(3,pos);  MAD(4,pos);  MAD(5,pos);  MAD(6,pos);  MAD(7,pos);  \
        MAD(8,pos);  MAD(9,pos);  MAD(10,pos); MAD(11,pos); MAD(12,pos); MAD(13,pos); MAD(14,pos); MAD(15,pos); \
        MAD(16,pos); MAD(17,pos); MAD(18,pos); MAD(19,pos); MAD(20,pos); MAD(21,pos); MAD(22,pos); MAD(23,pos); \
        MAD(24,pos); MAD(25,pos); MAD(26,pos); MAD(27,pos); MAD(28,pos); MAD(29,pos); MAD(30,pos); MAD(31,pos); \
        MAD(32,pos); MAD(33,pos); MAD(34,pos); MAD(35,pos); MAD(36,pos); MAD(37,pos); MAD(38,pos); MAD(39,pos); \
    }}

    kernf val[CONV_UNROLL];
    for(j=0;j<CONV_UNROLL;j++) val[j]=(kernf)(0.0);

    int localSize = get_local_size(0) * get_local_size(1) * get_local_size(2);
    local kernf filter[kernsize*kernsize*kernsize];

    /* Copy global filter to local memory */
    event_t event = async_work_group_copy(filter,filterG,kernsize*kernsize*kernsize,0);
    wait_group_events(1, &event);

    if(gid.s0 + kernsize + CONV_UNROLL > imagesize.s0 ||
       gid.s1 + kernsize > imagesize.s1 ||
       gid.s2 + kernsize > imagesize.s2) return;

    for(dz=0;dz<kernsize;dz++)
        for(dy=0;dy<kernsize;dy++) {
            int offset = dy*kernsize*nkernels + dz*kernsize*kernsize*nkernels;
            int imoffset2 = imoffset + dy*imagesize.s0 + dz*imagesize.s0*imagesize.s1;
            unsigned char raw;
            /* kernsize + convolution_unroll < 42 */
            MADS(0);  MADS(1);  MADS(2);  MADS(3);  MADS(4);  MADS(5);  MADS(6);  MADS(7);
            MADS(8);  MADS(9);  MADS(10); MADS(11); MADS(12); MADS(13); MADS(14); MADS(15);
            MADS(16); MADS(17); MADS(18); MADS(19); MADS(20); MADS(21); MADS(22); MADS(23);
            MADS(24); MADS(25); MADS(26); MADS(27); MADS(28); MADS(29); MADS(30); MADS(31);
            MADS(32); MADS(33); MADS(34); MADS(35); MADS(36); MADS(37); MADS(38); MADS(39);
            MADS(40); MADS(41);
        }

    for(j=0;j<CONV_UNROLL;j++) {
        kernstore( convert_kernuc(val[j]), imoffset+j, output);
    }
}
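/* Reference sketch (assumption, single filter: nkernels = 1): each work-item
   above accumulates CONV_UNROLL convolutions shifted by one pixel along X, so
   every input byte it loads feeds up to CONV_UNROLL accumulators. The
   equivalent non-unrolled sum for a single output pixel, in plain C, is: */
static float convolve_one(const unsigned char *in, const float *filt,
                          int nx, int ny,       /* image X/Y extents      */
                          int x, int y, int z,  /* output pixel position  */
                          int ks)               /* filter edge length     */
{
    float acc = 0.0f;
    for (int dz = 0; dz < ks; dz++)
        for (int dy = 0; dy < ks; dy++)
            for (int dx = 0; dx < ks; dx++)
                acc += (float)in[(x+dx) + nx*(y+dy) + nx*ny*(z+dz)]
                     * filt[dx + ks*dy + ks*ks*dz];
    return acc;
}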