Example #1
kernel void CylinderKernel(global const real *qx, global const real *qy, global real *result,
#ifdef USE_OPENCL
    global real *loops_g,
#else
    const int Nq,
#endif
    local real *loops, const real cutoff,
    const real scale, const real background,
    const real sub,
    const int Nradius, const int Nlength, const int Ntheta, const int Nphi)
{
#ifdef USE_OPENCL
    // copy loops info to local memory
    event_t e = async_work_group_copy(loops, loops_g, (Nradius+Nlength+Ntheta+Nphi)*2, 0);
    wait_group_events(1, &e);

    int i = get_global_id(0);
    int count = get_global_size(0);

    if(i < count)
#else
    #pragma omp parallel for
    for (int i=0; i<Nq; i++)
#endif
    {
        const real qxi=qx[i];
        const real qyi=qy[i];
        real ret=REAL(0.0), norm=REAL(0.0), norm_vol=REAL(0.0), vol=REAL(0.0);
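        // 'loops' holds (value, weight) pairs for each integration variable,
        // concatenated in the order radius, length, theta, phi.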
        for (int ri=0; ri < Nradius; ri++) {
            const real rv = loops[2*ri];
            const real rw = loops[2*ri+1];
            for (int li=0; li < Nlength; li++) {
                const real lv = loops[2*(li+Nradius)];
                const real lw = loops[2*(li+Nradius)+1];
                for (int thi=0; thi < Ntheta; thi++) {
                    const real thv = loops[2*(thi+Nradius+Nlength)];
                    const real thw = loops[2*(thi+Nradius+Nlength)+1];
                    // #pragma unroll
                    for (int phi=0; phi < Nphi; phi++) {
                        const real phv = loops[2*(phi+Nradius+Nlength+Ntheta)];
                        const real phw = loops[2*(phi+Nradius+Nlength+Ntheta)+1];

                        const real weight = rw*lw*thw*phw;
                        //ret += qxi + qyi + sub + rv + lv + weight + thv + phv;
                        if (weight > cutoff) {
                            ret += f(qxi, qyi, sub, rv, lv, weight, thv, phv);
                            norm += weight;
                            vol += rw*lw*rv*rv*lv;
                            norm_vol += rw*lw;
                        }
                    }
                }
            }
        }
        //if (Ntheta>1) norm = norm/(M_PI/2);
        if (vol != REAL(0.0) && norm_vol != REAL(0.0)) {
            ret *= norm_vol/vol;
        }
        result[i] = scale*ret/norm+background;
    }
}
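The `local real *loops` argument is not backed by a buffer; on the host it is sized with a NULL argument value so the runtime reserves the local memory that `async_work_group_copy` fills. A minimal sketch of the argument setup, assuming `real` maps to `cl_float` and using a hypothetical helper name not taken from the original project:

#include <CL/cl.h>

/* Hypothetical helper, not part of the original project. */
static void set_cylinder_args(cl_kernel k, cl_mem qx, cl_mem qy, cl_mem result,
                              cl_mem loops_g, cl_float cutoff, cl_float scale,
                              cl_float background, cl_float sub,
                              cl_int Nradius, cl_int Nlength, cl_int Ntheta, cl_int Nphi)
{
    /* Same element count the kernel passes to async_work_group_copy. */
    size_t loops_bytes = (size_t)(Nradius + Nlength + Ntheta + Nphi) * 2 * sizeof(cl_float);
    cl_uint a = 0;
    clSetKernelArg(k, a++, sizeof(cl_mem), &qx);
    clSetKernelArg(k, a++, sizeof(cl_mem), &qy);
    clSetKernelArg(k, a++, sizeof(cl_mem), &result);
    clSetKernelArg(k, a++, sizeof(cl_mem), &loops_g);   /* USE_OPENCL branch only */
    clSetKernelArg(k, a++, loops_bytes, NULL);          /* local real *loops */
    clSetKernelArg(k, a++, sizeof(cl_float), &cutoff);
    clSetKernelArg(k, a++, sizeof(cl_float), &scale);
    clSetKernelArg(k, a++, sizeof(cl_float), &background);
    clSetKernelArg(k, a++, sizeof(cl_float), &sub);
    clSetKernelArg(k, a++, sizeof(cl_int), &Nradius);
    clSetKernelArg(k, a++, sizeof(cl_int), &Nlength);
    clSetKernelArg(k, a++, sizeof(cl_int), &Ntheta);
    clSetKernelArg(k, a++, sizeof(cl_int), &Nphi);
}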
Example #2
           block.s8, block.s9, block.sa, block.sb, block.sc, block.sd, block.se, block.sf);
    printf("\n");
}*/

__kernel void AES_ECB_Encrypt(
    __global const uchar16* restrict plainText,
    __global const uchar16* restrict expandedKey,
    __global uchar16* restrict cipherText,
    const unsigned int rounds)
{
    __local uchar16 localExpandedKey[15];

    // Start the async copy of the round keys into local memory; passing 0 as
    // the event argument tells the runtime to allocate a new event.
    event_t cacheEvent = async_work_group_copy(
        localExpandedKey,
        expandedKey,
        rounds,
        0
    );

    const int global_id = get_global_id(0);
    uchar16 state = plainText[global_id];
    wait_group_events(1, &cacheEvent);

    state = AES_AddRoundKey(state, localExpandedKey[0]);

    for (int i = 1; i < rounds - 1; ++i)
    {
        state = AES_SubBytes(state);
        state = AES_ShiftRows(state);
        state = AES_MixColumns(state);
        state = AES_AddRoundKey(state, localExpandedKey[i]);
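The listing is cut off here. Assuming `rounds` counts the expanded round keys (which is what the `i < rounds - 1` loop bound and the 15-entry local cache suggest), a standard ECB encrypt kernel would close the loop, apply a final round that skips MixColumns, and write the block back. A sketch of that tail, not taken from the original source:

    }

    // Final round: SubBytes and ShiftRows, but no MixColumns.
    state = AES_SubBytes(state);
    state = AES_ShiftRows(state);
    state = AES_AddRoundKey(state, localExpandedKey[rounds - 1]);

    cipherText[global_id] = state;
}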
Example #3
kernel void EllipsoidKernel(global const real *qx, global const real *qy, global real *result,
#ifdef USE_OPENCL
    global real *loops_g,
#else
    const int Nq,
#endif
    local real *loops, const real cutoff,
    const real scale, const real background,
    const real sub,
    const int Nradius_a, const int Nradius_b, const int Ntheta, const int Nphi)
{
#ifdef USE_OPENCL
    // copy loops info to local memory
    event_t e = async_work_group_copy(loops, loops_g, (Nradius_a+Nradius_b+Ntheta+Nphi)*2, 0);
    wait_group_events(1, &e);

    int i = get_global_id(0);
    int count = get_global_size(0);

    if(i < count)
#else
    #pragma omp parallel for
    for (int i=0; i<Nq; i++)
#endif
    {
        const real qxi=qx[i];
        const real qyi=qy[i];
        real ret=REAL(0.0), norm=REAL(0.0), norm_vol=REAL(0.0), vol=REAL(0.0);
        for (int ai=0; ai < Nradius_a; ai++) {
            const real rav = loops[2*ai];
            const real raw = loops[2*ai+1];
            for (int bi=0; bi < Nradius_b; bi++) {
                const real rbv = loops[2*(bi+Nradius_a)];
                const real rbw = loops[2*(bi+Nradius_a)+1];
                for (int thi=0; thi < Ntheta; thi++) {
                    const real thv = loops[2*(thi+Nradius_a+Nradius_b)];
                    const real thw = loops[2*(thi+Nradius_a+Nradius_b)+1];
                    // #pragma unroll
                    for (int phi=0; phi < Nphi; phi++) {
                        const real phv = loops[2*(phi+Nradius_a+Nradius_b+Ntheta)];
                        const real phw = loops[2*(phi+Nradius_a+Nradius_b+Ntheta)+1];

                        const real weight = raw*rbw*thw*phw;
                        //ret += qxi + qyi + sub + rav + rbv + weight + thv + phv;
                        if (weight > cutoff) {
                            ret += f(qxi, qyi, sub, rav, rbv, weight, thv, phv);
                            norm += weight;
                            vol += raw*rbw*rav*rbv*rbv;
                            norm_vol += raw*rbw;
                        }
                    }
                }
            }
        }
        //if (Ntheta>1) norm = norm/(M_PI/2);
        if (vol != REAL(0.0) && norm_vol != REAL(0.0)) {
            ret *= norm_vol/vol;
        }
        result[i] = scale*ret/norm+background;
    }
}
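Both the cylinder and ellipsoid kernels compile either as OpenCL (USE_OPENCL defined) or as plain C with OpenMP. The `kernel`, `global` and `local` qualifiers and the `real`/`REAL()` names must be resolved by a preprocessor prelude; the sketch below shows one plausible shim, where the exact names and type choices are assumptions rather than the project's actual header:

#ifdef USE_OPENCL
    /* OpenCL build: kernel, global and local are language keywords already;
       only the floating-point type needs to be selected. */
    typedef float real;
    #define REAL(x) (x##f)
#else
    /* Plain C + OpenMP build: strip the OpenCL qualifiers so the same source
       compiles as an ordinary C function. */
    #define kernel
    #define global
    #define local
    typedef double real;
    #define REAL(x) (x)
#endif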
kernel void convolute(int4 imagesize, global unsigned char *input,
                      global unsigned char *output, global kernf *filterG) {
  int4 gid = (int4)(get_global_id(0)*CONV_UNROLL,  get_global_id(1),  get_global_id(2),  0);
  int4 lid = (int4)(get_local_id(0), get_local_id(1), get_local_id(2), 0);
  int4 group = (int4)(get_group_id(0), get_group_id(1), get_group_id(2), 0);
  // First of the CONV_UNROLL pixels processed by this work-item
  int4 pixelid = gid;

  // Starting offset of the first pixel to process
  int imoffset = pixelid.s0 + imagesize.s0 * pixelid.s1 + imagesize.s0 * imagesize.s1 * pixelid.s2;
  int i,j;

  int dx,dy,dz;

  /* MAD performs a single multiply-accumulate step of the convolution,
     using the current 'raw' value as the input sample,
     'ko' to select which of the CONV_UNROLL unrolled convolutions is updated,
     and 'pos' as the X-offset of the input sample within the current row.
     Note that all the if statements depend only on compile-time constants,
     so the compiler can optimize them away.
  */
#define MAD(ko,pos) {if(CONV_UNROLL>ko) {    \
      if(pos-ko >= 0 && pos-ko < kernsize) {    \
        val[ko] = mmad(val[ko],(kernf)(raw),filter[(pos-ko)+offset]);   \
      }}}
#define MADS(pos) {if(pos<kernsize) { \
    raw=input[imoffset2+pos];       \
    MAD(0,pos); MAD(1,pos); MAD(2,pos); MAD(3,pos); MAD(4,pos); MAD(5,pos); MAD(6,pos); MAD(7,pos); \
    MAD(8,pos); MAD(9,pos); MAD(10,pos); MAD(11,pos); MAD(12,pos); MAD(13,pos); MAD(14,pos); MAD(15,pos); \
    MAD(16,pos); MAD(17,pos); MAD(18,pos); MAD(19,pos); MAD(20,pos); MAD(21,pos); MAD(22,pos); MAD(23,pos); \
    MAD(24,pos); MAD(25,pos); MAD(26,pos); MAD(27,pos); MAD(28,pos); MAD(29,pos); MAD(30,pos); MAD(31,pos); \
    MAD(32,pos); MAD(33,pos); MAD(34,pos); MAD(35,pos); MAD(36,pos); MAD(37,pos); MAD(38,pos); MAD(39,pos); \
    }}

  kernf val[CONV_UNROLL];
  for(j=0;j<CONV_UNROLL;j++)
    val[j]=(kernf)(0.0);

  int localSize = get_local_size(0) * get_local_size(1) * get_local_size(2);
  local kernf filter[kernsize*kernsize*kernsize];

  /* Copy global filter to local memory */
  event_t event = async_work_group_copy(filter,filterG,kernsize*kernsize*kernsize,0);
  wait_group_events(1, &event);

  if(gid.s0 + kernsize + CONV_UNROLL > imagesize.s0 ||
     gid.s1 + kernsize > imagesize.s1 ||
     gid.s2 + kernsize > imagesize.s2) return;

  for(dz=0;dz<kernsize;dz++)
    for(dy=0;dy<kernsize;dy++)  {
      int offset = dy*kernsize*nkernels + dz*kernsize*kernsize*nkernels;
      int imoffset2 = imoffset+dy*imagesize.s0 + dz*imagesize.s0*imagesize.s1;
      unsigned char raw;

      /* kernsize + convolution_unroll < 42 */
      MADS(0); MADS(1); MADS(2); MADS(3); MADS(4); MADS(5);
      MADS(6); MADS(7); MADS(8); MADS(9); MADS(10); MADS(11);
      MADS(12); MADS(13); MADS(14); MADS(15); MADS(16); MADS(17);
      MADS(18); MADS(19); MADS(20); MADS(21); MADS(22); MADS(23);
      MADS(24); MADS(25); MADS(26); MADS(27); MADS(28); MADS(29);
      MADS(30); MADS(31); MADS(32); MADS(33); MADS(34); MADS(35);
      MADS(36); MADS(37); MADS(38); MADS(39); MADS(40); MADS(41);
    }

  for(j=0;j<CONV_UNROLL;j++) {
    kernstore( convert_kernuc(val[j]), imoffset+j, output);
  }
}
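The convolute kernel depends on several compile-time names: CONV_UNROLL, kernsize and nkernels, plus the kernf/kernuc types and the mmad, convert_kernuc and kernstore helpers. A plausible way to supply the numeric constants is through build options; the values below are placeholders, and the type and helper definitions are assumed to come from a prelude prepended to the kernel source:

#include <stdio.h>
#include <CL/cl.h>

/* Hypothetical build step; the -D values are example settings, not taken from the listing. */
static cl_program build_convolute(cl_context ctx, cl_device_id dev, const char *src)
{
    cl_int err;
    cl_program prog = clCreateProgramWithSource(ctx, 1, &src, NULL, &err);
    const char *opts = "-DCONV_UNROLL=4 -Dkernsize=5 -Dnkernels=8";
    if (clBuildProgram(prog, 1, &dev, opts, NULL, NULL) != CL_SUCCESS) {
        char log[16384];
        clGetProgramBuildInfo(prog, dev, CL_PROGRAM_BUILD_LOG, sizeof(log), log, NULL);
        fprintf(stderr, "convolute build failed:\n%s\n", log);
    }
    return prog;
}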